From 4d2cde9f7ac7984cb6c9023988590c8961909a70 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 24 Mar 2016 21:37:36 -0400 Subject: [PATCH 001/174] chain-smbr: Adding chain-smbr denominator --- src/chain/chain-denominator-smbr.cc | 388 ++++++++++++++++++++++++++++ src/chain/chain-denominator-smbr.h | 259 +++++++++++++++++++ src/chain/chain-denominator.h | 4 +- 3 files changed, 650 insertions(+), 1 deletion(-) create mode 100644 src/chain/chain-denominator-smbr.cc create mode 100644 src/chain/chain-denominator-smbr.h diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc new file mode 100644 index 00000000000..6ed1a4e18da --- /dev/null +++ b/src/chain/chain-denominator-smbr.cc @@ -0,0 +1,388 @@ +// chain/chain-denominator-smbr.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2016 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "chain/chain-denominator-smbr.h" +#include "chain/chain-kernels-ansi.h" + +namespace kaldi { +namespace chain { + +DenominatorSmbrComputation::DenominatorSmbrComputation( + const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + int32 num_sequences, + const CuMatrixBase &nnet_output): + opts_(opts), + den_graph_(den_graph), + num_sequences_(num_sequences), + frames_per_sequence_(nnet_output.NumRows() / num_sequences_), + exp_nnet_output_transposed_(nnet_output, kTrans), + nnet_output_deriv_transposed_( + exp_nnet_output_transposed_.NumRows(), + std::min(exp_nnet_output_transposed_.NumCols(), + static_cast(kMaxDerivTimeSteps) * + num_sequences_)), + alpha_(frames_per_sequence_ + 1, + den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), + beta_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), + tot_prob_(num_sequences_, kUndefined), + tot_log_prob_(num_sequences_, kUndefined), + log_correction_term_(num_sequences_, kUndefined), + ok_(true), + alpha_smbr_(frames_per_sequence_ + 1, + den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), + beta_smbr_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), + tot_objf_(num_sequences_, kUndefined) { + KALDI_ASSERT(opts_.leaky_hmm_coefficient > 0.0 && + opts_.leaky_hmm_coefficient < 1.0); + // make sure the alpha sums and beta sums are zeroed. + alpha_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + alpha_smbr_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + beta_smbr_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + + KALDI_ASSERT(nnet_output.NumRows() % num_sequences == 0); + exp_nnet_output_transposed_.ApplyExp(); +} + + +void DenominatorSmbrComputation::AlphaSmbrFirstFrame() { + // dim == num_hmm_states_ * num_sequences_. 
+ BaseFloat *first_frame_alpha_smbr = alpha_smbr_.RowData(0); + // create a 'fake matrix' - view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix alpha_smbr_mat(first_frame_alpha_smbr, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + // TODO (possible): It would be more efficient here if we implemented a + // CopyColsFromVec function in class CuMatrix. + alpha_smbr_mat.SetZero(); +} + + +// the alpha computation for some 0 < t <= num_time_steps_. +void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { + KALDI_ASSERT(t > 0 && t <= frames_per_sequence_); + BaseFloat *this_alpha = alpha_.RowData(t); + BaseFloat *this_alpha_smbr = alpha_smbr_.RowData(t); + const BaseFloat *prev_alpha_dash = alpha_.RowData(t - 1); + const BaseFloat *prev_prev_alpha_dash = (t > 1 ? alpha_.RowData(t - 2) : NULL); + const Int32Pair *backward_transitions = den_graph_.BackwardTransitions(); + const DenominatorGraphTransition *transitions = den_graph_.Transitions(); + int32 num_pdfs = exp_nnet_output_transposed_.NumRows(), + num_hmm_states = den_graph_.NumStates(), + num_sequences = num_sequences_; + + // 'probs' is the matrix of pseudo-likelihoods for frame t - 1. + CuSubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, + (t-1) * num_sequences_, num_sequences_); + const BaseFloat *prob_data = probs.Data(); + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); + dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); + + cuda_chain_hmm_forward(dimGrid, dimBlock, backward_transitions, transitions, + num_sequences, prob_data, probs.Stride(), + prev_alpha_dash, this_alpha); + + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + int32 prob_stride = probs.Stride(); + for (int32 h = 0; h < num_hmm_states; h++) { + for (int32 s = 0; s < num_sequences; s++) { + double this_tot_alpha = 0.0; + double this_tot_alpha_smbr = 0.0; + const DenominatorGraphTransition + *trans_iter = transitions + backward_transitions[h].first, + *trans_end = transitions + backward_transitions[h].second; + for (; trans_iter != trans_end; ++trans_iter) { + BaseFloat transition_prob = trans_iter->transition_prob; + int32 pdf_id = trans_iter->pdf_id, + prev_hmm_state = trans_iter->hmm_state; + BaseFloat prob = prob_data[pdf_id * prob_stride + s], + this_prev_alpha = prev_alpha_dash[prev_hmm_state * num_sequences + s], + this_prev_alpha_smbr = prev_alpha_dash_smbr[prev_hmm_state * num_sequences + s], + + this_tot_alpha += this_prev_alpha * transition_prob * prob; + this_tot_alpha_smbr += this_prev_alpha_smbr * transition_prob * prob + + (pdf_id == ref_pdf_id ? this_prev_alpha * prev_prev_alpha_dash[num_hmm_states * num_sequences + s] * transition_prob * prob : 0.0); + } + // Let arbitrary_scale be the inverse of the alpha-sum value that we + // store in the same place we'd store the alpha for the state numbered + // 'num_hmm_states'. We multiply this into all the + // transition-probabilities from the previous frame to this frame, in + // both the forward and backward passes, in order to keep the alphas in + // a good numeric range. This won't affect the posteriors, but when + // computing the total likelihood we'll need to compensate for it later + // on. 
+ BaseFloat arbitrary_scale = + 1.0 / prev_alpha_dash[num_hmm_states * num_sequences + s]; + KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); + KALDI_ASSERT(this_tot_alpha_smbr - this_tot_alpha_smbr == 0); + this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; + } + } + } +} + +BaseFloat DenominatorSmbrComputation::ForwardSmbr() { + AlphaSmbrFirstFrame(); + AlphaDash(0); + for (int32 t = 1; t <= frames_per_sequence_; t++) { + AlphaSmbrGeneralFrame(t); + AlphaDash(t); + } + return ComputeTotObjf(); +} + +BaseFloat DenominatorSmbrComputation::ComputeTotObjf() { + tot_prob_.Resize(num_sequences_); + // View the last alpha-dash as a matrix of size num-hmm-states by num-sequences. + CuSubMatrix last_alpha_dash( + alpha_.RowData(frames_per_sequence_), + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + CuMatrix last_alpha_smbr_dash( + alpha_smbr_.RowData(frames_per_sequence_), + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + + tot_prob_.AddRowSumMat(1.0, last_alpha_dash, 0.0); + // we should probably add an ApplyLog() function that takes a vector argument. + tot_log_prob_ = tot_prob_; + tot_log_prob_.ApplyLog(); + BaseFloat tot_log_prob = tot_log_prob_.Sum(); + + // We now have to add something for the arbitrary scaling factor. [note: the + // purpose of the arbitrary scaling factors was to keep things in a good + // floating-point range] + // The inverses of all the tot-alpha quantities, for t = 0 + // ... frames_per_sequence_ - 1, were included as the 'arbitrary factors' in + // the transition-probs, so we need to multiply them all together (not + // inversed) and add them as a correction term to the total log-likes. + // These tot-alpha quantities were stored in the same place that we would + // have stored the HMM-state numbered 'num_hmm_states'. + int32 num_hmm_states = den_graph_.NumStates(); + CuSubMatrix inv_arbitrary_scales( + alpha_, 0, frames_per_sequence_, + num_sequences_ * num_hmm_states, num_sequences_); + CuMatrix log_inv_arbitrary_scales( + inv_arbitrary_scales); + log_inv_arbitrary_scales.ApplyLog(); + BaseFloat log_inv_arbitrary_scales_product = + log_inv_arbitrary_scales.Sum(); + BaseFloat tot_objf = last_alpha_smbr_dash.Sum(); + return tot_objf; +} + + + +bool DenominatorSmbrComputation::BackwardSmbr( + BaseFloat deriv_weight, + CuMatrixBase *nnet_output_deriv) { + BetaSmbrDashLastFrame(); + BetaSmbr(frames_per_sequence_); + for (int32 t = frames_per_sequence_ - 1; t >= 0; t--) { + BetaSmbrDashGeneralFrame(t); + if (GetVerboseLevel() >= 1 || t == 0) + BetaSmbrGeneralFrameDebug(t); + Beta(t); + if (t % kMaxDerivTimeSteps == 0) { + // commit the derivative stored in exp_nnet_output_transposed_ by adding + // its transpose to the appropriate sub-matrix of 'nnet_output_deriv'. + int32 chunk_frames = std::min(static_cast(kMaxDerivTimeSteps), + frames_per_sequence_ - t), + num_pdfs = exp_nnet_output_transposed_.NumRows(); + CuSubMatrix transposed_deriv_part( + nnet_output_deriv_transposed_, + 0, num_pdfs, + 0, chunk_frames * num_sequences_); + CuSubMatrix output_deriv_part( + *nnet_output_deriv, + t * num_sequences_, chunk_frames * num_sequences_, + 0, num_pdfs); + output_deriv_part.AddMat(deriv_weight, transposed_deriv_part, kTrans); + if (t != 0) + transposed_deriv_part.SetZero(); + } + } + return ok_; +} + +void DenominatorSmbrComputation::BetaSmbrDashLastFrame() { + // sets up the beta-dash quantity on the last frame (frame == + // frames_per_sequence_). 
Note that the betas we use here contain a + // 1/(tot-prob) factor in order to simplify the backprop. + + int32 t = frames_per_sequence_; + BaseFloat *last_frame_beta_dash = beta_.RowData(t % 2); + BaseFloat *last_frame_beta_smbr_dash = beta_smbr_.RowData(t % 2); + + // create a 'fake matrix' - view this row as a matrix. + CuSubMatrix beta_dash_mat(last_frame_beta_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + CuVector inv_tot_prob(tot_prob_); + inv_tot_prob.InvertElements(); + // the beta values at the end of the file only vary with the sequence-index, + // not with the HMM-index. We treat all states as having a final-prob of one. + beta_dash_mat.CopyRowsFromVec(inv_tot_prob); + CuSubMatrix beta_smbr_dash_mat(last_frame_beta_smbr_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + beta_smbr_dash_mat.SetZero(); +} + +void DenominatorSmbrComputation::BetaSmbrDashGeneralFrame(int32 t) { + KALDI_ASSERT(t >= 0 && t < frames_per_sequence_); + int32 num_pdfs = exp_nnet_output_transposed_.NumRows(); + // t_wrapped gives us the time-index we use when indexing + // nnet_output_deriv_transposed_; to save memory we limit the size of the + // matrix, storing only chunks of frames at a time, and we add it to the + // non-transposed output whenever we finish a chunk. + int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps); + const BaseFloat *this_alpha_dash = alpha_.RowData(t), + *this_alpha_smbr_dash = alpha_smbr_.RowData(t), + *next_beta = beta_.RowData((t + 1) % 2), + *next_betas_smbr = beta_smbr.RowData((t + 1) % 2); + BaseFloat *this_beta_dash = beta_.RowData(t % 2); + BaseFloat *this_beta_smbr_dash = beta_smbr_.RowData(t % 2); + const Int32Pair *forward_transitions = den_graph_.ForwardTransitions(); + const DenominatorGraphTransition *transitions = den_graph_.Transitions(); + // 'probs' is the matrix of pseudo-likelihoods for frame t. 
+ CuSubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, + t * num_sequences_, num_sequences_), + log_prob_deriv(nnet_output_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_); + + int32 num_hmm_states = den_graph_.NumStates(), + num_sequences = num_sequences_; + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); + dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); + cuda_chain_hmm_backward(dimGrid, dimBlock, forward_transitions, transitions, + num_sequences, probs.Data(), probs.Stride(), + this_alpha_dash, next_beta, this_beta_dash, + log_prob_deriv.Data(), log_prob_deriv.Stride()); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + int32 prob_stride = probs.Stride(), + deriv_stride = log_prob_deriv.Stride(); + const BaseFloat *prob_data = probs.Data(); + BaseFloat *log_prob_deriv_data = log_prob_deriv.Data(); + for (int32 h = 0; h < num_hmm_states; h++) { + for (int32 s = 0; s < num_sequences; s++) { + BaseFloat this_alpha_dash_prob = this_alpha_dash[h * num_sequences + s], + this_alpha_smbr_dash_prob = this_alpha_smbr_dash[h * num_sequences + s], + inv_arbitrary_scale = + this_alpha_dash[num_hmm_states * num_sequences + s]; + double tot_variable_factor = 0.0, tot_smbr_variable_factor = 0.0; + BaseFloat occupation_factor = this_alpha_dash_prob / + inv_arbitrary_scale; + const DenominatorGraphTransition + *trans_iter = transitions + forward_transitions[h].first, + *trans_end = transitions + forward_transitions[h].second; + for (; trans_iter != trans_end; ++trans_iter) { + BaseFloat transition_prob = trans_iter->transition_prob; + int32 pdf_id = trans_iter->pdf_id, + next_hmm_state = trans_iter->hmm_state; + BaseFloat variable_factor = transition_prob * + next_beta[next_hmm_state * num_sequences + s] * + prob_data[pdf_id * prob_stride + s]; + BaseFloat smbr_variable_factor = (pdf_id == ref_pdf_id ? 
variable_factor : 0.0) + + transition_prob * prob_data[pdf_id * prob_stride + s] * next_beta_smbr[next_hmm_state * num_sequences + s]; + tot_variable_factor += variable_factor; + BaseFloat occupation_prob = variable_factor * occupation_factor; + gamma_smbr = smbr_varaible_factor * occupation_factor; + log_prob_deriv_data[pdf_id * deriv_stride + s] += occupation_prob; + } + this_beta_dash[h * num_sequences + s] = + tot_variable_factor / inv_arbitrary_scale; + this_beta_dash_smbr[h * num_sequences + s] = + tot_smbr_variable_factor / inv_arbitrary_scale; + } + } + } +} + +void DenominatorSmbrComputation::BetaGeneralFrameDebug(int32 t) { + BaseFloat num_hmm_states = den_graph_.NumStates(), + alpha_beta_size = num_hmm_states * num_sequences_; + CuSubVector this_alpha_dash(alpha_.RowData(t), alpha_beta_size), + this_beta_dash(beta_.RowData(t % 2), alpha_beta_size); + int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps), + num_pdfs = exp_nnet_output_transposed_.NumRows(); + CuSubMatrix this_log_prob_deriv( + nnet_output_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_); + BaseFloat alpha_beta_product = VecVec(this_alpha_dash, + this_beta_dash), + this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); + if (!ApproxEqual(alpha_beta_product, num_sequences_)) { + KALDI_WARN << "On time " << t << ", alpha-beta product " + << alpha_beta_product << " != " << num_sequences_ + << " alpha-dash-sum = " << this_alpha_dash.Sum() + << ", beta-dash-sum = " << this_beta_dash.Sum(); + if (fabs(alpha_beta_product - num_sequences_) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } + // use higher tolerance, since we are using randomized pruning for the + // log-prob derivatives. + if (!ApproxEqual(this_log_prob_deriv_sum, + num_sequences_, 0.01)) { + KALDI_WARN << "On time " << t << ", log-prob-deriv sum " + << this_log_prob_deriv_sum << " != " << num_sequences_; + if (fabs(this_log_prob_deriv_sum - num_sequences_) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } +} + + +} // namespace chain +} // namespace kaldi + diff --git a/src/chain/chain-denominator-smbr.h b/src/chain/chain-denominator-smbr.h new file mode 100644 index 00000000000..c34a4d1a4eb --- /dev/null +++ b/src/chain/chain-denominator-smbr.h @@ -0,0 +1,259 @@ +// chain/chain-denominator-smbr.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) +// 2016 Vimal Manohar + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_CHAIN_DENOMINATOR_SMBR_H_ +#define KALDI_CHAIN_CHAIN_DENOMINATOR_SMBR_H_ + +#include "chain/chain-denominator.h" + +namespace kaldi { +namespace chain { + +/* + This extended comment describes how we implement forward-backward without log + and without overflow, and also the leaky-HMM idea. 
+ + We'll start by establishing the notation for conventional forward-backward, + then add the 'arbitrary-scale' concept that prevents overflow, and then + add the 'leaky-hmm' concept. + + All this is done in parallel over multiple sequences, but the computations + are independent over the separate sequences, so we won't introduce any notation + or index for the sequence; we'll just explain it for one sequences. + + Suppose we have I hmm-states, numbered i = 0 ... I-1 (we'll use i and j for + hmm-state indexes). Let foll(i) give a list of arcs leaving state i, and + pred(i) give a list of arcs entering state i, and we'll use notation like: + for (j, p, n) in foll(i): + for iterating over those arcs, where in this case j is the destination-state, + p is the transition-probability of the arc and n is the pdf-id index. + We can then look up the emission probability as x(t, n) for some frame + 0 <= t < T. + + ** Version 1 of the computation (naive version) ** + + * Forward computation (version 1) + + In the forward computation we're computing alpha(i, t) for 0 <= t <= T): + - For the first frame, set alpha(0, i) = init(i), where init(i) is the + initial-probabilitiy from state i. # in our framework these are obtained + # by running the HMM for a while and getting an averaged occupation + # probability, and using this as an initial-prob, since the boundaries of + # chunks don't really correspond to utterance boundaries in general.] + - For t = 1 ... T: + for i = 0 ... I-1: + alpha(t, i) = 0 + alpha_r(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += x(t-1, n) * alpha(t-1, j) * p + alpha_r(t, i) += alpha_r(t-1, j) * alpha(t-1, j) + x(t-1, n) * p * (ref_pdf == pdf ? 1.0 : 0.0) + alpha_r(t, i) /= alpha(t, i) + + - total-prob = \sum_i alpha(T, i). # note, we take the final-probs of all states + # to be 1.0. + - total-objf = \sum_i alpha(T, i) * alpha_r(T, i) / total-prob + + * Backward computation (version 1) + + And now for the backward computation. Contrary to tradition, we include the + inverse of the total-prob as a factor in the betas. This is both more + convenient (it simplifies the way we obtain posteriors), and makes the + algorithm more generalizable as all the beta quantities can be interpreted as + the partial derivative of the logprob with respect to their corresponding + alpha. + + In forward backward notation, gamma is normally used for state-level + occupation probabilities, but what we care about here is pdf-id-level + occupation probabilities (i.e. the partial derivative of the log-likelihood + w.r.t. the logs of the x(t, n) quantities), so we use gamma for that. + + - for the final frame: + for each i, beta(T, i) = 1 / total-prob. + for each i, beta_r(T, i) = 0 + - for t = T-1 ... 0: + for i = 0 ... I-1: + beta(t, i) = 0 + beta_r(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta(t, i) += x(t, n) * beta(t+1, j) * p. + beta_r(t, i) += (ref_pdf == pdf ? x(t, n) * beta(t+1, j) * p : 0) + x(t, n) * p * beta_r(t+1, j). + gamma(t, n) += alpha(t, i) * x(t, n) * beta(t+1, j) * p. + + ** Version 2 of the computation (renormalized version) ** + + Version 1 of the algorithm is susceptible to numeric underflow and overflow, + due to the limited range of IEEE floating-point exponents. + Define tot-alpha(t) = \sum_i alpha(t, i). Then the renormalized version of + the computation is as above, except whenever the quantity x(t, n) appears, + we replace it with x(t, n) / tot-alpha(t). 
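 For concreteness, here is a minimal sketch of this renormalization on a toy
 HMM (Python/numpy, purely illustrative -- the toy arcs and names below are
 made up and are not part of the Kaldi sources). It runs the plain 'version 1'
 forward recursion with and without the per-frame scaling, and shows that
 adding back the logs of the tot-alpha values recovers the same total log-prob:

   import numpy as np

   def total_logprob(x, arcs, init, scale=False):
       # x: T x num_pdfs pseudo-likelihoods; arcs: (src, dst, trans_prob, pdf);
       # init: initial probability of each HMM state.
       alpha = init.astype(np.float64)
       correction = 0.0
       for t in range(x.shape[0]):
           tot_alpha = alpha.sum()                  # 'tot-alpha(t)'
           s = 1.0 / tot_alpha if scale else 1.0    # 'arbitrary_scale'
           if scale:
               correction += np.log(tot_alpha)
           new_alpha = np.zeros_like(alpha)
           for (i, j, p, n) in arcs:
               new_alpha[j] += alpha[i] * p * (x[t, n] * s)
           alpha = new_alpha
       return np.log(alpha.sum()) + correction

   # toy 2-state, 2-pdf HMM: the scaled and unscaled runs print the same value.
   arcs = [(0, 0, 0.7, 0), (0, 1, 0.3, 1), (1, 1, 0.6, 1), (1, 0, 0.4, 0)]
   init = np.array([0.5, 0.5])
   x = np.random.RandomState(0).rand(20, 2)
   print(total_logprob(x, arcs, init, scale=False),
         total_logprob(x, arcs, init, scale=True))
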
In the algorithm we refer to + 1.0 / tot-alpha(t) as 'arbitrary_scale', because mathematically we can use any + value here as long as we are consistent and the value only varies with t + and not with n; we'll always get the same posteriors (gamma). + + When the algorithm outputs log(total-prob) as the total log-probability + of the HMM, we have to instead return the expression: + log(total-prob) + \sum_{t=0}^{T-1} tot-alpha(t). + to correct for the scaling of the x values. + + The algorithm is still vulnerable to overflow in the beta computation because + it's possible that the dominant path could have a very tiny alpha. However, + once we introduce the leaky-HMM idea (below), this problem will disappear. + + ** Version 3 of the computation (leaky-HMM version) ** + + The leaky-HMM idea is intended to improve generalization by allowing paths + other than those explicitly allowed by the FST we compiled. Another way to + look at it is as a way of hedging our bets about where we split the utterance, + so it's as we're marginalizing over different splits of the utterance. You + could also think of it as a modification of the FST so that there is an + epsilon transition from each state to a newly added state, with probability + one, and then an epsilon transition from the newly added state to each state + with probability leaky-hmm-prob * init(i) [except we need a mechanism so that + no more than two epsilon transitions can be taken per frame- this would involve + creating two copies of the states] + + Recall that we mentioned that init(i) is the initial-probability of + HMM-state i, but these are obtained in such a way that they can be treated + as priors, or average occupation-probabilities. + + Anyway, the way we formulate leaky-hmm is as follows: + + * Forward computation (version 3) + + Let leaky-hmm-prob be a constant defined by the user, with 0.1 being a typical + value. It defines how much probability we give to the 'leaky' transitions. + + - For frame 0, set alpha(0, i) = init(i). + - For 0 <= t <= T, define tot-alpha(t) = \sum_i alpha(t, i). + - For 0 <= t <= T, define alpha'(t, i) = alpha(t, i) + tot-alpha(t) * leaky-hmm-prob * init(i). + + - For 1 <= t <= T, the computation of alpha(t, i) is as before except we use + the previous frame's alpha' instead of alpha. That is: + alpha(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += alpha'(t-1, j) * p * x(t-1, n) / tot-alpha(t-1) + + - total-prob = \sum_i alpha'(T, i) + + The corrected log-prob that we return from the algorithm will be + (total-prob + \sum_{t=0}^{T-1} tot-alpha(t)). + + * Backward computation (version 3) + + The backward computation is as follows. It is fairly straightforward to + derive if you think of it as an instance of backprop where beta, tot-beta and + beta' are the partial derivatives of the output log-prob w.r.t. the + corresponding alpha, tot-alpha and alpha' quantities. Note, tot-beta is not + really the sum of the betas as its name might suggest, it's just the + derivative w.r.t. tot-alpha. + + - beta'(T, i) = 1 / total-prob. + - for 0 <= t <= T, define tot-beta(t) = leaky-hmm-prob * \sum_i init(i) * beta'(t, i) + - for 0 <= t <= T, define beta(t, i) = beta'(t, i) + tot-beta(t). + - for 0 <= t < T, we compute beta'(t, i) and update gamma(t, n) as follows: + for 0 <= i < I: + beta'(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. 
+ beta'(t, i) += beta(t+1, j) * p * x(t, n) / tot-alpha(t) + gamma(t, n) += alpha'(t, i) * beta(t+1, j) * p * x(t, n) / tot-alpha(t) + + Note: in the code, the tot-alpha and tot-beta quantities go in the same + memory location that the corresponding alpha and beta for state I would go. + + */ + +class DenominatorSmbrComputation : DenominatorComputation { + public: + /* + Constructor. 'nnet_output' is the raw nnet output (which we'll treat as + pseudo-log-likelihoods). + + @param [in] opts The options. + @param [in] graph The HMM that we use for the denominator (like a decoding graph, + with pdf-ids on the transitions). + @param [in] num_sequences The number of separate time sequences (all of the same length) + that we are working with. Must divide nnet_output.NumRows(). + @param [in] nnet_output The output of the neural network for this minibatch. + The rows must be ordered as (first frame of all sequences) + (second frame of all sequences), etc. + */ + DenominatorSmbrComputation(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + int32 num_sequences, + const CuMatrixBase &nnet_output); + + // Does the forward computation, and returns the total negated log-like summed + // over all sequences. You will have to scale this by any supervision + // weighting factor, manually. + BaseFloat ForwardSmbr(); + + // this adds deriv_weight times (the derivative of the log-prob w.r.t. the + // nnet output), to 'nnet_output_deriv'. + // returns true if everything seemed OK, false if a failure was detected. + bool BackwardSmbr(BaseFloat deriv_weight, + CuMatrixBase *nnet_output_deriv); + + private: + // Defining this constant as an enum is easier. it controls a memory/speed + // tradeoff, determining how many frames' worth of the transposed derivative + // we store at a time. It's not very critical; the only disadvantage from + // setting it small is that we have to invoke an AddMat kernel more times. + enum { kMaxDerivTimeSteps = 8 }; + + // sets up the alpha for frame t = 0. + void AlphaSmbrFirstFrame(); + // the alpha computation for some 0 < t <= num_time_steps_. + void AlphaSmbrGeneralFrame(int32 t); + // does the 'alpha-dash' computation for time t. this relates to + // 'leaky hmm'. + void AlphaSmbrDash(int32 t); + + // done after all the alphas, this function computes and returns the total + // smbr objective summed over all the sequences, and sets tot_prob_ (if we're + // doing correction) log_correction_term_. Note, this won't be scaled by + // 'deriv_scale' (which of course we haven't seen by the time this is called, + // from the Forward() computation). + BaseFloat ComputeTotObjf(); + + + void BetaSmbrDashLastFrame(); + // beta computation for 0 <= beta < num_time_steps_. + void BetaSmbrDashGeneralFrame(int32 t); + // compute the beta quantity from the beta-dash quantity (relates to leaky hmm). + void BetaSmbr(int32 t); + + // some checking that we can do if debug mode is activated, or on frame zero. + // Sets ok_ to false if a bad problem is detected. + void BetaSmbrGeneralFrameDebug(int32 t); + + CuMatrix alpha_smbr_; + + CuMatrix beta_smbr_; + + CuVector tot_smbr_; +}; + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_CHAIN_DENOMINATOR_H_ + diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h index b0f616673d6..1633c59cbb5 100644 --- a/src/chain/chain-denominator.h +++ b/src/chain/chain-denominator.h @@ -110,7 +110,7 @@ namespace chain { due to the limited range of IEEE floating-point exponents. 
Define tot-alpha(t) = \sum_i alpha(t, i). Then the renormalized version of the computation is as above, except whenever the quantity x(t, n) appears, - we replace it with x(t, n) / alpha(t). In the algorithm we refer to + we replace it with x(t, n) / tot-alpha(t). In the algorithm we refer to 1.0 / tot-alpha(t) as 'arbitrary_scale', because mathematically we can use any value here as long as we are consistent and the value only varies with t and not with n; we'll always get the same posteriors (gamma). @@ -305,6 +305,8 @@ class DenominatorComputation { CuVector log_correction_term_; bool ok_; + + friend DenominatorSmbrComputation; }; From 58fcd61ad773b7b5fe68b782e3f541fb0f388cbc Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 18 May 2017 13:58:40 -0400 Subject: [PATCH 002/174] long_utts: Minor fix --- egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh | 3 ++- egs/wsj/s5/steps/cleanup/decode_segmentation.sh | 3 ++- egs/wsj/s5/steps/cleanup/segment_long_utterances.sh | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh b/egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh index 1dfa74ab3f6..3f0d3f2fcbd 100755 --- a/egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh +++ b/egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh @@ -127,8 +127,9 @@ if [ $n1 != $n2 ]; then fi mkdir -p $dir/split_fsts +sort -k1,1 $graphdir/HCLG.fsts.scp > $dir/HCLG.fsts.sorted.scp utils/filter_scps.pl --no-warn -f 1 JOB=1:$nj \ - $sdata/JOB/feats.scp $graphdir/HCLG.fsts.scp $dir/split_fsts/HCLG.fsts.JOB.scp + $sdata/JOB/feats.scp $dir/HCLG.fsts.sorted.scp $dir/split_fsts/HCLG.fsts.JOB.scp HCLG=scp:$dir/split_fsts/HCLG.fsts.JOB.scp diff --git a/egs/wsj/s5/steps/cleanup/decode_segmentation.sh b/egs/wsj/s5/steps/cleanup/decode_segmentation.sh index e07105e8e08..c4887d60283 100755 --- a/egs/wsj/s5/steps/cleanup/decode_segmentation.sh +++ b/egs/wsj/s5/steps/cleanup/decode_segmentation.sh @@ -114,8 +114,9 @@ fi mkdir -p $dir/split_fsts +sort -k1,1 $graphdir/HCLG.fsts.scp > $dir/HCLG.fsts.sorted.scp utils/filter_scps.pl --no-warn -f 1 JOB=1:$nj \ - $sdata/JOB/feats.scp $graphdir/HCLG.fsts.scp $dir/split_fsts/HCLG.fsts.JOB.scp + $sdata/JOB/feats.scp $dir/HCLG.fsts.sorted.scp $dir/split_fsts/HCLG.fsts.JOB.scp HCLG=scp:$dir/split_fsts/HCLG.fsts.JOB.scp if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh index de1a04c3c23..30feedfdbd2 100755 --- a/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh +++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh @@ -153,7 +153,7 @@ if [ $stage -le 3 ]; then # and then copy it to the sub-segments. mkdir -p $graph_dir - cat $dir/uniform_sub_segments | awk '{print $1" "$2}' | \ + sort -k1,2 $dir/uniform_sub_segments | awk '{print $1" "$2}' | \ utils/apply_map.pl -f 2 $dir/graphs/HCLG.fsts.scp > \ $graph_dir/HCLG.fsts.scp From d97edb6fe0705275bb5d97d1d000a664f3b1e7c9 Mon Sep 17 00:00:00 2001 From: Pegita Date: Thu, 18 May 2017 18:19:07 -0400 Subject: [PATCH 003/174] added Transfer learning setup using nnet3+chain+tdnn for WSJ->RM. 
--- egs/rm/s5/RESULTS | 3 + egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh | 184 ++++++++++++++++++ egs/rm/s5/local/online/run_nnet2_common.sh | 45 +++-- .../nnet3/train/chain_objf/acoustic_model.py | 14 +- egs/wsj/s5/steps/libs/nnet3/train/common.py | 3 +- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 59 +++++- egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py | 8 + egs/wsj/s5/steps/nnet3/chain/train.py | 8 +- egs/wsj/s5/steps/nnet3/train_dnn.py | 21 +- egs/wsj/s5/steps/nnet3/xconfig_to_configs.py | 43 +++- src/nnet3/nnet-component-itf.h | 5 + src/nnet3/nnet-utils.cc | 26 +++ src/nnet3/nnet-utils.h | 3 + 13 files changed, 382 insertions(+), 40 deletions(-) create mode 100755 egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh diff --git a/egs/rm/s5/RESULTS b/egs/rm/s5/RESULTS index ecafb588cfe..368abd2751f 100644 --- a/egs/rm/s5/RESULTS +++ b/egs/rm/s5/RESULTS @@ -234,6 +234,9 @@ for x in exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_*; do grep WER $x/ %WER 2.86 [ 358 / 12533, 46 ins, 61 del, 251 sub ] exp/chain/tdnn_5g/decode/wer_5_0.0 %WER 2.71 [ 340 / 12533, 58 ins, 59 del, 223 sub ] exp/chain/tdnn_5n/decode/wer_4_0.0 +### WSJ->RM Transfer learning using chain model ### +%WER 2.21 [ 277 / 12533, 42 ins, 45 del, 190 sub ] exp/chain/tdnn_wsj_rm/decode/wer_3_0.5 + ### nnet1 results ### # dnn4b, MFCC,LDA,fMLLR feaures, (Karel - 30.7.2015) diff --git a/egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh b/egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh new file mode 100755 index 00000000000..9d032aab9f3 --- /dev/null +++ b/egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh @@ -0,0 +1,184 @@ +#!/bin/bash +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_wsj_rm + +# training options +num_epochs=12 +initial_effective_lrate=0.005 +final_effective_lrate=0.0005 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +xent_regularize=0.1 + +# configs for transfer learning +srcdir=../../wsj/s5/ +common_egs_dir= +src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl +primary_lr_factor=0.25 +dim=450 +nnet_affix=_online +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 6 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir +fi + +if [ $stage -le 7 ]; then + echo "$0: creating neural net configs using the xconfig parser for"; + echo "extra layers w.r.t source network."; + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + mkdir -p $dir + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + relu-renorm-layer name=tdnn7-target input=Append(tdnn6.renorm@-3,tdnn6.renorm@0) dim=$dim + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain-target input=tdnn7-target dim=$dim target-rms=0.5 + output-layer name=output-target include-log-softmax=false dim=$num_targets max-change=1.5 + relu-renorm-layer name=prefinal-xent-target input=tdnn7-target dim=$dim target-rms=0.5 + output-layer name=output-xent-target dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + # edits.config contains edits required to train transferred model. + # e.g. substitute output-node of previous model with new output + # and removing orphan nodes and components. + cat < $dir/configs/edits.config + remove-output-nodes name=output + remove-output-nodes name=output-xent + rename-node old-name=output-target new-name=output + rename-node old-name=output-xent-target new-name=output-xent + remove-orphans +EOF + steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \ + --xconfig-file $dir/configs/network.xconfig \ + --edits-config $dir/configs/edits.config \ + --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + echo "$0: generate egs for chain to train new model on rm dataset." + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + echo "$0: set the learning-rate-factor for initial network to be zero." 
+ nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" \ + $src_mdl $dir/init.raw || exit 1; + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ + --chain.xent-regularize $xent_regularize \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=200" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch=$minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs false \ + --feat-dir data/train_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri3b_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 9 ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ + data/test $srcdir/exp/nnet3/extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; +fi + +if [ $stage -le 10 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 1" \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ + $dir/graph data/test_hires $dir/decode || exit 1; +fi + +if [ $stage -le 11 ]; then + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ + $dir/graph_ug data/test_hires $dir/decode_ug || exit 1; +fi +wait; +exit 0; diff --git a/egs/rm/s5/local/online/run_nnet2_common.sh b/egs/rm/s5/local/online/run_nnet2_common.sh index 1cd8abfba54..27ad7cf9aeb 100755 --- a/egs/rm/s5/local/online/run_nnet2_common.sh +++ b/egs/rm/s5/local/online/run_nnet2_common.sh @@ -4,7 +4,10 @@ stage=1 - +nnet_affix=_online +extractor=exp/nnet2${nnet_affix}/extractor +ivector_dim=50 +mfcc_config=conf/mfcc_hires.conf . cmd.sh . ./path.sh . ./utils/parse_options.sh @@ -26,22 +29,38 @@ else num_threads=16 minibatch_size=128 parallel_opts="--num-threads $num_threads" - dir=exp/nnet2_online/nnet + dir=exp/nnet2${nnet_affix}/nnet fi +train_set=train +if [ $stage -le 0 ]; then + echo "$0: creating high-resolution MFCC features." 
+ mfccdir=data/${train_set}_hires/data + + for datadir in $train_set test; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires -if [ $stage -le 1 ]; then - mkdir -p exp/nnet2_online - steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 10 --num-frames 200000 \ - data/train 256 exp/tri3b exp/nnet2_online/diag_ubm + steps/make_mfcc.sh --nj 30 --mfcc-config $mfcc_config \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done fi -if [ $stage -le 2 ]; then - # use a smaller iVector dim (50) than the default (100) because RM has a very - # small amount of data. - steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 4 \ - --ivector-dim 50 \ - data/train exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1; +if [ ! -f $extractor/final.dubm ]; then + if [ $stage -le 1 ]; then + mkdir -p exp/nnet2${nnet_affix} + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 10 --num-frames 200000 \ + data/train 256 exp/tri3b exp/nnet2${nnet_affix}/diag_ubm + fi + + if [ $stage -le 2 ]; then + # use a smaller iVector dim (50) than the default (100) because RM has a very + # small amount of data. + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 4 \ + --ivector-dim $ivector_dim \ + data/train exp/nnet2${nnet_affix}/diag_ubm $extractor || exit 1; + fi fi if [ $stage -le 3 ]; then @@ -50,5 +69,5 @@ if [ $stage -le 3 ]; then steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train_max2 steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ - data/train_max2 exp/nnet2_online/extractor exp/nnet2_online/ivectors || exit 1; + data/train_max2 $extractor exp/nnet2${nnet_affix}/ivectors || exit 1; fi diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index f28aa89774e..80a1f0154bc 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -455,6 +455,19 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1): common_train_lib.prepare_initial_network(dir, run_opts, srand=srand) + # edits 0.raw using edits.config before adding transition model. + edits_config_file = "{0}/configs/edits.config".format(dir) + if os.path.exists(edits_config_file): + logger.info("edits 0.raw model using {0}/configs/edits.config." + "".format(dir)) + common_lib.run_job( + """{command} {dir}/log/edit.log \ + nnet3-copy --edits-config={edits_config} {dir}/0.raw \ + {dir}/0.raw + """.format(command=run_opts.command, + dir=dir, + edits_config=edits_config_file)) + # The model-format for a 'chain' acoustic model is just the transition # model and then the raw nnet, so we can use 'cat' to create this, as # long as they have the same mode (binary or not binary). 
@@ -466,7 +479,6 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1): nnet3-am-init {dir}/0.trans_mdl {dir}/0.raw \ {dir}/0.mdl""".format(command=run_opts.command, dir=dir)) - def compute_train_cv_probabilities(dir, iter, egs_dir, left_context, right_context, l2_regularize, xent_regularize, leaky_hmm_coefficient, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index e6ef511e7f2..b913acb2627 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -444,7 +444,8 @@ def smooth_presoftmax_prior_scale_vector(pdf_counts, def prepare_initial_network(dir, run_opts, srand=-3): - if os.path.exists(dir+"/configs/init.config"): + if os.path.exists(dir+"/configs/init.config") or os.path.exists( + "{0}/init.raw".format(dir)): common_lib.run_job( """{command} {dir}/log/add_first_layer.log \ nnet3-init --srand={srand} {dir}/init.raw \ diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 918d8bd2fb2..9e2c1576147 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -10,7 +10,7 @@ import sys import libs.nnet3.xconfig.layers as xlayers import libs.nnet3.xconfig.utils as xutils - +import libs.common as common_lib # We have to modify this dictionary when adding new layers @@ -49,11 +49,63 @@ def xconfig_line_to_object(config_line, prev_layers = None): "*** {0}".format(config_line), file=sys.stderr) raise e +# This function reads existing model file with nnet3 format and returns it as +# list of layers with name and dimension to be used as auxilary information +# to generate xconfig. +def read_model(model_filename): + all_layers = [] + try: + f = open(model_filename, 'r') + except Exeption as e: + sys.exit("{0}: error reading model file '{1}'".format(sys.argv[0], model_filename, repr(e))) + + # use nnet3-info to get component names in the model. + out, err = common_lib.run_kaldi_command("nnet3-info {0} | grep '\-node' " + "".format(model_filename)) + + # out contains all component-nodes used in model_filename + layer_names = [] + for line in out.split("\n"): + parts = line.split(" ") + input_dim = -1 + output_dim = -1 + for field in parts: + key_value = field.split("=") + if len(key_value) == 2: + key = key_value[0] + value = key_value[1] + if key == "name": + layer_name = value + #layer_name, auxiliary_output = xutils.split_layer_name(value) + elif key == "input-dim": + input_dim = int(value) + elif key == "output-dim": + output_dim = int(value) + elif key == "input": + input_str = value + + if layer_name is not None and layer_name not in layer_names: + key_to_value = dict() + layer_names.append(layer_name) + key_to_value['name'] = layer_name + if input_dim != -1: + if output_dim == -1: + # The layer is input layer type. + key_to_value['dim'] = input_dim + elif input_str is not None: + key_to_value['dim'] = output_dim + all_layers.append(xlayers.XconfigInputLayer('input', key_to_value, all_layers)) + if len(all_layers) == 0: + raise RuntimeError("{0}: model filename '{1}' is empty.".format( + sys.argv[0], model_filename)) + f.close() + return all_layers + # This function reads an xconfig file and returns it as a list of layers # (usually we use the variable name 'all_layers' elsewhere for this). # It will die if the xconfig file is empty or if there was # some error parsing it. 
-def read_xconfig_file(xconfig_filename): +def read_xconfig_file(xconfig_filename, aux_layers=[]): try: f = open(xconfig_filename, 'r') except Exception as e: @@ -66,10 +118,11 @@ def read_xconfig_file(xconfig_filename): break # the next call will raise an easy-to-understand exception if # it fails. - this_layer = xconfig_line_to_object(line, all_layers) + this_layer = xconfig_line_to_object(line, aux_layers) if this_layer is None: continue # line was blank after removing comments. all_layers.append(this_layer) + aux_layers.append(this_layer) if len(all_layers) == 0: raise RuntimeError("{0}: xconfig file '{1}' is empty".format( sys.argv[0], xconfig_filename)) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py index 3d958568717..e80a51a85ef 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py @@ -60,6 +60,10 @@ def get_dim_from_layer_name(all_layers, current_layer, full_layer_name): for layer in all_layers: if layer is current_layer: break + # if "." used in layer name + if layer.get_name() == full_layer_name: + return layer.output_dim() + if layer.get_name() == layer_name: if not auxiliary_output in layer.auxiliary_outputs() and auxiliary_output is not None: raise RuntimeError("Layer '{0}' has no such auxiliary output: '{1}' ({0}.{1})".format(layer_name, auxiliary_output)) @@ -85,6 +89,10 @@ def get_string_from_layer_name(all_layers, current_layer, full_layer_name): for layer in all_layers: if layer is current_layer: break + # full_layer_name with "." + if layer.get_name() == full_layer_name: + return layer.output_name() + if layer.get_name() == layer_name: if not auxiliary_output in layer.auxiliary_outputs() and auxiliary_output is not None: raise RuntimeError("Layer '{0}' has no such auxiliary output: '{1}' ({0}.{1})".format( diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 19276817ea0..1d70d0a57d8 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -310,7 +310,8 @@ def train(args, run_opts, background_process_handler): logger.info("Creating denominator FST") chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts) - if (args.stage <= -4): + if (args.stage <= -4) and os.path.exists("{dir}/config/init.config".format( + dir=args.dir)): logger.info("Initializing a basic network for estimating " "preconditioning matrix") common_lib.run_kaldi_command( @@ -358,7 +359,7 @@ def train(args, run_opts, background_process_handler): [egs_left_context, egs_right_context, frames_per_eg_str, num_archives] = ( - common_train_lib.verify_egs_dir(egs_dir, feat_dim, + common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, ivector_id, egs_left_context, egs_right_context, egs_left_context_initial, @@ -375,7 +376,8 @@ def train(args, run_opts, background_process_handler): logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir)) common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) - if (args.stage <= -2): + if (args.stage <= -2) and os.path.exists("{dir}/config/init.config".format( + dir=args.dir)): logger.info('Computing the preconditioning matrix for input features') chain_lib.compute_preconditioning_matrix( diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 2f324512114..e3d5c735101 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -199,14 +199,17 @@ def train(args, 
run_opts, background_process_handler): # we do this as it's a convenient way to get the stats for the 'lda-like' # transform. - if (args.stage <= -5) and os.path.exists(args.dir+"/configs/init.config"): - logger.info("Initializing a basic network for estimating " - "preconditioning matrix") - common_lib.run_job( - """{command} {dir}/log/nnet_init.log \ - nnet3-init --srand=-2 {dir}/configs/init.config \ - {dir}/init.raw""".format(command=run_opts.command, - dir=args.dir)) + if (args.stage <= -5): + if os.path.exists(args.dir+"/configs/init.config"): + logger.info("Initializing a basic network for estimating " + "preconditioning matrix") + common_lib.run_job( + """{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config \ + {dir}/init.raw""".format(command=run_opts.command, + dir=args.dir)) + else: + assert(os.path.exists(args.dir+"/init.raw")) default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -4) and args.egs_dir is None: @@ -232,7 +235,7 @@ def train(args, run_opts, background_process_handler): [egs_left_context, egs_right_context, frames_per_eg_str, num_archives] = ( - common_train_lib.verify_egs_dir(egs_dir, feat_dim, + common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, ivector_id, left_context, right_context)) assert(str(args.frames_per_eg) == frames_per_eg_str) diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index 7e876bda1ed..2b6992bb86d 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -28,6 +28,15 @@ def get_args(): epilog='Search egs/*/*/local/{nnet3,chain}/*sh for examples') parser.add_argument('--xconfig-file', required=True, help='Filename of input xconfig file') + parser.add_argument('--existing-model', + help='The component nodes in this model can be ' + 'used as input to generate new config file. ' + 'e.g. generate new model using nodes in ' + 'existing model.') + parser.add_argument('--edits-config', + help='This is applied to the raw model before ' + 'computing model context and back_compatibility. ' + 'e.g. 
renaming output-nodes.') parser.add_argument('--config-dir', required=True, help='Directory to write config files and variables') @@ -209,11 +218,17 @@ def write_config_files(config_dir, all_layers): raise -def add_back_compatibility_info(config_dir): +def add_back_compatibility_info(config_dir, existing_model=None, + edits_config=None): """This will be removed when python script refactoring is done.""" - - common_lib.run_kaldi_command("nnet3-init {0}/ref.config " - "{0}/ref.raw".format(config_dir)) + raw_model = "{0}/ref.raw".format(config_dir) + if edits_config is not None: + raw_model = " - | nnet3-copy --edits-config={0} - {1}".format( + edits_config, raw_model) + common_lib.run_kaldi_command("nnet3-init {0} {1}/ref.config " + "{2}".format(existing_model if + existing_model is not None else "", + config_dir, raw_model)) out, err = common_lib.run_kaldi_command("nnet3-info {0}/ref.raw | " "head -4".format(config_dir)) # out looks like this @@ -241,13 +256,17 @@ def add_back_compatibility_info(config_dir): common_lib.force_symlink("final.config".format(config_dir), "{0}/layer1.config".format(config_dir)) -def check_model_contexts(config_dir): +def check_model_contexts(config_dir, existing_model=None, edits_config=None): contexts = {} for file_name in ['init', 'ref']: if os.path.exists('{0}/{1}.config'.format(config_dir, file_name)): contexts[file_name] = {} - common_lib.run_kaldi_command("nnet3-init {0}/{1}.config " - "{0}/{1}.raw".format(config_dir, file_name)) + raw_model = "{0}/{1}.raw".format(config_dir, file_name) + if edits_config is not None: + raw_model = " - | nnet3-copy --edits-config={0} - {1}".format( + edits_config, raw_model) + common_lib.run_kaldi_command("nnet3-init {0} {1}/{2}.config " + "{3}".format(existing_model if existing_model is not None else "", config_dir, file_name, raw_model)) out, err = common_lib.run_kaldi_command("nnet3-info {0}/{1}.raw | " "head -4".format(config_dir, file_name)) # out looks like this @@ -280,11 +299,15 @@ def check_model_contexts(config_dir): def main(): args = get_args() backup_xconfig_file(args.xconfig_file, args.config_dir) - all_layers = xparser.read_xconfig_file(args.xconfig_file) + aux_layers = [] + if args.existing_model is not None: + aux_layers = xparser.read_model(args.existing_model) + all_layers = xparser.read_xconfig_file(args.xconfig_file, aux_layers) write_expanded_xconfig_files(args.config_dir, all_layers) write_config_files(args.config_dir, all_layers) - check_model_contexts(args.config_dir) - add_back_compatibility_info(args.config_dir) + check_model_contexts(args.config_dir, args.existing_model, args.edits_config) + add_back_compatibility_info(args.config_dir, args.existing_model, + args.edits_config) if __name__ == '__main__': diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index 7cf438a025e..5b40576b495 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -405,6 +405,11 @@ class UpdatableComponent: public Component { /// learning_rate_factor_. virtual void SetAsGradient() { learning_rate_ = 1.0; is_gradient_ = true; } + // Sets the learning rate factors to set to this value. + virtual void SetLearningRateFactor(BaseFloat lrate_factor) { + learning_rate_factor_ = lrate_factor; + } + /// Gets the learning rate of gradient descent. Note: if you call /// SetLearningRate(x), and learning_rate_factor_ != 1.0, /// a different value than x will returned. 
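The new SetLearningRateFactor() above is meant to combine multiplicatively with
the rate later passed to SetUnderlyingLearningRate(). A minimal toy sketch of
that interaction (illustrative Python, assuming the factor simply scales
whatever rate the training scripts request; it is not the Kaldi C++):

  class ToyUpdatableComponent:
      def __init__(self):
          self.learning_rate_factor = 1.0
          self.learning_rate = 0.0

      def set_learning_rate_factor(self, factor):
          # what the 'set-learning-rate-factor' edit / SetLearningRateFactor()
          # is meant to do: remember a per-component scaling factor.
          self.learning_rate_factor = factor

      def set_underlying_learning_rate(self, lrate):
          # analogous to SetUnderlyingLearningRate(): the stored factor scales
          # whatever rate the trainer asks for.
          self.learning_rate = self.learning_rate_factor * lrate

  c = ToyUpdatableComponent()
  c.set_learning_rate_factor(0.25)       # cf. primary_lr_factor in the RM recipe
  c.set_underlying_learning_rate(0.004)  # some rate chosen by the trainer
  print(c.learning_rate)                 # -> 0.001
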
diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 27415fe8775..28fc575c910 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -584,6 +584,32 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) { } } KALDI_LOG << "Set learning rates for " << num_learning_rates_set << " nodes."; + } else if (directive == "set-learning-rate-factor") { + std::string name_pattern = "*"; + // name_pattern defaults to '*' if non is given. + config_line.GetValue("name", &name_pattern); + BaseFloat learning_rate_factor = -1; + if (!config_line.GetValue("learning-rate-factor", &learning_rate_factor)) { + KALDI_ERR << "In edits-config, expected learning-rate-factor to be set in line: " + << config_line.WholeLine(); + } + // Note: the learning_rate_factor_ defined in the component + // sets to the value you provided, so if you call SetUnderlyingLearningRate(), + // the actual learning rate (learning_rate_) is set to the value you provided + // times learning_rate. + UpdatableComponent *component = NULL; + int32 num_learning_rate_factors_set = 0; + for (int32 c = 0; c < nnet->NumComponents(); c++) { + if (NameMatchesPattern(nnet->GetComponentName(c).c_str(), + name_pattern.c_str()) && + (component = + dynamic_cast(nnet->GetComponent(c)))) { + component->SetLearningRateFactor(learning_rate_factor); + num_learning_rate_factors_set++; + } + } + KALDI_LOG << "Set learning rate factors for " << num_learning_rate_factors_set + << " nodes."; } else if (directive == "rename-node") { // this is a shallow renaming of a node, and it requires that the name used is // not the name of another node. diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 041a916fb69..203e537dc13 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -204,6 +204,9 @@ void FindOrphanNodes(const Nnet &nnet, std::vector *nodes); set-learning-rate [name=] learning-rate= Sets the learning rate for any updatable nodes matching the name pattern. + set-learning-rate-factor [name=] learning-rate-factor= + Sets the learning rate factor for any updatable nodes matching the name pattern. + rename-node old-name= new-name= Renames a node; this is a surface renaming that does not affect the structure (for structural changes, use the regular config file format, not the From b5fe795891da8406a622f95e204e553340aa23fe Mon Sep 17 00:00:00 2001 From: Pegita Date: Mon, 22 May 2017 17:34:55 -0400 Subject: [PATCH 004/174] fixed some issues w.r.t comments. 
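Besides documentation fixes, this renames read_model() to
get_model_component_info() and standardizes on the --edits-config option.
With these names, the intended calling pattern for the helper is roughly the
following (illustrative Python; the model path follows the WSJ->RM recipe and
this is not an exact transcript of xconfig_to_configs.py):

  import libs.nnet3.xconfig.parser as xparser

  # the WSJ source chain model, as in local/chain/run_tdnn_wsj_rm.sh
  existing_model = "exp/chain/tdnn1d_sp/final.mdl"

  # one pseudo input layer per input-/component-node of the existing model,
  # carrying only its name and output dimension:
  aux_layers = xparser.get_model_component_info(existing_model)

  # layers in the new xconfig may then refer to those nodes (e.g. tdnn6.renorm):
  all_layers = xparser.read_xconfig_file("configs/network.xconfig", aux_layers)
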
--- .../s5/local/nnet3/run_tdnn_multilingual.sh | 4 +- egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh | 10 ++++- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 11 +++-- egs/wsj/s5/steps/nnet3/chain/train.py | 5 +-- egs/wsj/s5/steps/nnet3/xconfig_to_configs.py | 40 +++++++++---------- 5 files changed, 39 insertions(+), 31 deletions(-) diff --git a/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh b/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh index ccb8a68030f..b2f30c62d38 100755 --- a/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh +++ b/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh @@ -181,7 +181,6 @@ if [ $stage -le 8 ]; then cat < $dir/configs/network.xconfig input dim=$ivector_dim name=ivector input dim=$feat_dim name=input - output name=output-tmp input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) # please note that it is important to have input layer with the name=input # as the layer immediately preceding the fixed-affine-layer to enable @@ -209,10 +208,9 @@ EOF EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ --config-dir $dir/configs/ \ - --nnet-edits=$dir/configs/edits.config + --edits-config=$dir/configs/edits.config cat <> $dir/configs/vars -add_lda=false include_log_softmax=false EOF diff --git a/egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh b/egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh index 46a8d710b9b..1e78f2b90ec 100755 --- a/egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh +++ b/egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh @@ -1,4 +1,12 @@ #!/bin/bash + +# This script uses weight transfer as Transfer learning method +# and use already trained model on wsj and remove the last layer and +# add new randomly initialized layer and retrain the whole network. +# while training new added layer using rm data. +# The chain config is as run_tdnn_5n.sh and the result is: +#System tdnn_5n tdnn_wsj_rm +#WER 2.71 2.21 set -e # configs for 'chain' @@ -114,7 +122,7 @@ EOF EOF steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \ --xconfig-file $dir/configs/network.xconfig \ - --nnet-edits $dir/configs/edits.config \ + --edits-config $dir/configs/edits.config \ --config-dir $dir/configs/ fi diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 9e2c1576147..3540d5197f5 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -49,10 +49,11 @@ def xconfig_line_to_object(config_line, prev_layers = None): "*** {0}".format(config_line), file=sys.stderr) raise e -# This function reads existing model file with nnet3 format and returns it as -# list of layers with name and dimension to be used as auxilary information -# to generate xconfig. -def read_model(model_filename): +# This reads raw existing model (*.raw) and returns array of +# XconfigInputLayer one per input-node or component-node with same 'name' used +# in raw model and 'dim' equal to 'output-dim' for component-node and input-dim for +# input-node. +def get_model_component_info(model_filename): all_layers = [] try: f = open(model_filename, 'r') @@ -105,6 +106,8 @@ def read_model(model_filename): # (usually we use the variable name 'all_layers' elsewhere for this). # It will die if the xconfig file is empty or if there was # some error parsing it. +# aux_layers is a list of auxilary layers(component-nodes or input-node) that +# can be used as input to component-nodes used in xconfig_file. 
def read_xconfig_file(xconfig_filename, aux_layers=[]): try: f = open(xconfig_filename, 'r') diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 1d70d0a57d8..5cb9b531c5d 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -285,7 +285,6 @@ def train(args, run_opts, background_process_handler): model_right_context = variables['model_right_context'] # this is really the number of times we add layers to the network for # discriminative pretraining - num_hidden_layers = variables['num_hidden_layers'] except KeyError as e: raise Exception("KeyError {0}: Variables need to be defined in " "{1}".format(str(e), '{0}/configs'.format(args.dir))) @@ -310,7 +309,7 @@ def train(args, run_opts, background_process_handler): logger.info("Creating denominator FST") chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts) - if (args.stage <= -4) and os.path.exists("{dir}/config/init.config".format( + if (args.stage <= -4) and os.path.exists("{dir}/configs/init.config".format( dir=args.dir)): logger.info("Initializing a basic network for estimating " "preconditioning matrix") @@ -376,7 +375,7 @@ def train(args, run_opts, background_process_handler): logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir)) common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) - if (args.stage <= -2) and os.path.exists("{dir}/config/init.config".format( + if (args.stage <= -2) and os.path.exists("{dir}/configs/init.config".format( dir=args.dir)): logger.info('Computing the preconditioning matrix for input features') diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index 47b03c9aa33..de92c03e659 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -29,19 +29,21 @@ def get_args(): parser.add_argument('--xconfig-file', required=True, help='Filename of input xconfig file') parser.add_argument('--existing-model', - help='This option is useful in case of ' - 'using component nodes in other network ' + help='Filename of previously trained neural net ' + '(e.g. final.mdl) which is useful in case of ' + 'using list of component-node in already trained model ' 'to generate new config file for new model.' - 'e.g. Transfer learning: generate new model using ' + 'e.g. In Transfer learning: generate new model using ' 'nodes in existing model.') parser.add_argument('--config-dir', required=True, help='Directory to write config files and variables') - parser.add_argument('--nnet-edits', type=str, default=None, + parser.add_argument('--edits-config', type=str, default=None, action=common_lib.NullstrToNoneAction, - help="This option is useful in case the network you are " + help="This is nnet3 config filename that is useful in " + "case the network you are " "creating does not have an output node called 'output' " "(e.g. for multilingual setups). You can set this to " - "an edit-string like: " + "an edits-config can contain string like: " "'rename-node old-name=xxx new-name=output' " "if node xxx plays the role of the output node in this " "network." 
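Taken together, --existing-model and --edits-config are intended to be used roughly as follows; the node name 'tdnn6.renorm', the dimensions, and the variables $src_mdl, $dir and $num_targets below are illustrative assumptions, not values fixed by this patch:

cat <<EOF > $dir/configs/network.xconfig
# 'tdnn6.renorm' is assumed to be a component-node of the existing model $src_mdl
relu-renorm-layer name=tdnn-target input=tdnn6.renorm dim=450
output-layer name=output input=tdnn-target dim=$num_targets include-log-softmax=false max-change=1.5
EOF
steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \
  --xconfig-file $dir/configs/network.xconfig \
  --edits-config $dir/configs/edits.config \
  --config-dir $dir/configs/
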
@@ -225,12 +227,12 @@ def write_config_files(config_dir, all_layers): raise -def add_back_compatibility_info(config_dir, existing_model=None, - nnet_edits=None): +def add_nnet_context_info(config_dir, existing_model=None, + edits_config=None): """This will be removed when python script refactoring is done.""" model = "{0}/ref.raw".format(config_dir) - if nnet_edits is not None: - model = """ - | nnet3-copy --edits-config={0} - {1}""".format(nnet_edits, + if edits_config is not None: + model = """ - | nnet3-copy --edits-config={0} - {1}""".format(edits_config, model) common_lib.run_kaldi_command("""nnet3-init {0} {1}/ref.config """ """ {2} """.format(existing_model if @@ -250,27 +252,25 @@ def add_back_compatibility_info(config_dir, existing_model=None, continue info[parts[0].strip()] = int(parts[1].strip()) - # Writing the back-compatible vars file + # Writing the 'vars' file: # model_left_context=0 # model_right_context=7 - # num_hidden_layers=3 vf = open('{0}/vars'.format(config_dir), 'w') vf.write('model_left_context={0}\n'.format(info['left-context'])) vf.write('model_right_context={0}\n'.format(info['right-context'])) - vf.write('num_hidden_layers=1\n') vf.close() common_lib.force_symlink("final.config".format(config_dir), "{0}/layer1.config".format(config_dir)) -def check_model_contexts(config_dir, existing_model=None, nnet_edits=None): +def check_model_contexts(config_dir, existing_model=None, edits_config=None): contexts = {} for file_name in ['init', 'ref']: if os.path.exists('{0}/{1}.config'.format(config_dir, file_name)): contexts[file_name] = {} model = "{0}/{1}.raw".format(config_dir, file_name) - if nnet_edits is not None: - model = """ - | nnet3-copy --edits-config={0} - {1}""".format(nnet_edits, + if edits_config is not None: + model = """ - | nnet3-copy --edits-config={0} - {1}""".format(edits_config, model) common_lib.run_kaldi_command("""nnet3-init {0} {1}/{2}.config """ """ {3} """.format(existing_model if @@ -313,13 +313,13 @@ def main(): backup_xconfig_file(args.xconfig_file, args.config_dir) aux_layers = [] if args.existing_model is not None: - aux_layers = xparser.read_model(args.existing_model) + aux_layers = xparser.get_model_component_info(args.existing_model) all_layers = xparser.read_xconfig_file(args.xconfig_file, aux_layers) write_expanded_xconfig_files(args.config_dir, all_layers) write_config_files(args.config_dir, all_layers) - check_model_contexts(args.config_dir, args.existing_model, args.nnet_edits) - add_back_compatibility_info(args.config_dir, args.existing_model, - args.nnet_edits) + check_model_contexts(args.config_dir, args.existing_model, args.edits_config) + add_nnet_context_info(args.config_dir, args.existing_model, + args.edits_config) if __name__ == '__main__': From 55cc6f94e00a49bcd188ab8b16ac4eafc197b5da Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Mon, 29 May 2017 02:27:02 -0400 Subject: [PATCH 005/174] [WIP] Add chain semi-supervised script + src changes --- .../local/chain/run_semisupervised_1a.sh | 158 ++++++++++++++++++ .../s5_r2/local/chain/tuning/run_tdnn_1d.sh | 3 +- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 4 + src/chain/chain-supervision.cc | 54 +++++- src/chain/chain-supervision.h | 7 +- 5 files changed, 218 insertions(+), 8 deletions(-) create mode 100755 egs/tedlium/s5_r2/local/chain/run_semisupervised_1a.sh diff --git a/egs/tedlium/s5_r2/local/chain/run_semisupervised_1a.sh b/egs/tedlium/s5_r2/local/chain/run_semisupervised_1a.sh new file mode 100755 index 00000000000..3760b755bb9 --- /dev/null +++ 
b/egs/tedlium/s5_r2/local/chain/run_semisupervised_1a.sh @@ -0,0 +1,158 @@ +#!/bin/bash + +set -e -o pipefail + + +#TODO: change some of these _sup to _semi + + +stage=0 +train_sup_stage_opt="--stage -10 --train-stage -10" +nj=30 +decode_nj=30 +base_supervised_set=train_cleaned +supervised_set=${base_supervised_set}_sup +unsupervised_set=${base_supervised_set}_unsup +base_gmm=tri3_cleaned # the starting point of training on the supervised data (no flat start for now) +gmm=${base_gmm}_sup # the gmm to be supplied to chain/run_tdnn.sh +nnet3_affix=_cleaned_sup # cleanup affix for nnet3 and chain dirs +tdnn_affix=_sup1d #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +lattice_lm_scale=0.1 +left_tolerance=2 +right_tolerance=2 + +num_iters=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < data/$supervised_set/supervised_uttlist || true + utils/shuffle_list.pl data/$base_supervised_set/feats.scp | cut -d' ' -f1 | tail -$num_unsupervised_utts > data/$supervised_set/unsupervised_uttlist || true + utils/subset_data_dir.sh --utt-list data/$supervised_set/supervised_uttlist data/$base_supervised_set data/$supervised_set || exit 1 + utils/subset_data_dir.sh --utt-list data/$supervised_set/unsupervised_uttlist data/$base_supervised_set data/$unsupervised_set || exit 1 + utils/data/subset_data_dir.sh --utt-list data/$unsupervised_set/feats.scp data/${base_supervised_set}_sp_hires data/${unsupervised_set}_hires +fi + +if [ $stage -le -3 ]; then + # align the supervised subset with the current cleaned gmm + if [ -f $gmm/ali.1.gz ]; then + echo "$0: alignments in $gmm appear to already exist. Please either remove them " + echo " ... or use a later --stage option." 
+ exit 1 + fi + echo "$0: aligning with the supervised data" + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/${supervised_set} data/lang exp/$base_gmm exp/$gmm + exit 0; +fi + +if [ $stage -le -2 ]; then + echo "$0: training on the supervised subset" + local/chain/run_tdnn.sh $train_sup_stage_opt --remove-egs false --train-set $supervised_set --gmm $gmm --nnet3-affix $nnet3_affix --tdnn-affix $tdnn_affix + exit 0; +fi + +if [ $stage -le -1 ]; then + echo "$0: getting ivectors for the unsupervised data" + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj "$nj" \ + data/${unsupervised_set}_hires exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires +fi + +chaindir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi +sup_chaindir=$chaindir + +left_context=`cat $chaindir/egs/info/left_context` +right_context=`cat $chaindir/egs/info/right_context` +left_context_initial=`cat $chaindir/egs/info/left_context_initial` +right_context_final=`cat $chaindir/egs/info/right_context_final` +frames_per_eg=`cat $chaindir/egs/info/frames_per_eg` +frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +cmvn_opts=`cat $chaindir/cmvn_opts` + +for iter in $(seq 0 $[$num_iters-1]); do + echo "$0: iteration: $iter" + + if [ $iter -ge $stage ]; then + echo "$0: getting the decoding lattices for the unsupervised subset using the chain model at: $chaindir" + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $chaindir/graph data/${unsupervised_set}_hires $chaindir/decode_${unsupervised_set} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${unsupervised_set}_hires \ + ${chaindir}/decode_${unsupervised_set} ${chaindir}/decode_${unsupervised_set}_rescore + ln -s ../final.mdl $chaindir/decode_${unsupervised_set}_rescore/final.mdl || true + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $left_tolerance --right-tolerance $right_tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + data/${unsupervised_set}_hires $chaindir ${chaindir}/decode_${unsupervised_set}_rescore $chaindir/unsup_egs + + # TODO: set supervision.weight for the unsupervised data and tune it (maybe try 0.25,0.5,0.75) + echo "$0: combining supervised/unsupervised egs" + num_archives=`cat $sup_chaindir/egs/info/num_archives` + sup_egs_dir=$sup_chaindir/egs + unsup_egs_dir=$chaindir/unsup_egs + comb_egs_dir=$chaindir/comb_egs + mkdir -p $comb_egs_dir/log + cp {$sup_egs_dir,$comb_egs_dir}/train_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/valid_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/combine.cegs + cp {$sup_egs_dir,$comb_egs_dir}/cmvn_opts + cp -r $sup_egs_dir/info $comb_egs_dir + cat {$sup_egs_dir,$unsup_egs_dir}/info/num_frames | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/num_frames + cat {$sup_egs_dir,$unsup_egs_dir}/info/egs_per_archive | awk 
'{s+=$1} END{print s}' > $comb_egs_dir/info/egs_per_archive + out_egs_list= + egs_list= + for n in $(seq $num_archives); do + egs_list="$egs_list $sup_egs_dir/cegs.$n.ark" + egs_list="$egs_list $unsup_egs_dir/cegs.$n.ark" + out_egs_list="$out_egs_list ark:$comb_egs_dir/cegs.$n.ark" + done + srand=0 + # $decode_cmd --mem 8G $comb_egs_dir/log/shuffle_combine.log \ + # nnet3-chain-shuffle-egs --srand=$srand "ark:cat $egs_list|" ark:- \| \ + # nnet3-chain-copy-egs --random=true --srand=$srand ark:- $out_egs_list + $decode_cmd $comb_egs_dir/log/combine.log \ + nnet3-chain-copy-egs "ark:cat $egs_list|" $out_egs_list + + echo "$0: training on the supervised+unsupervised subset" + # the train-set and gmm do not matter as we are providing the egs + local/chain/run_tdnn.sh --stage 17 --remove-egs false --train-set $supervised_set --gmm $gmm \ + --nnet3-affix $nnet3_affix --tdnn-affix ${tdnn_affix}_comb$[iter+1] --common-egs-dir $chaindir/comb_egs + fi + chaindir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_comb$[iter+1]_sp_bi + +done diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh index 99921a9bf61..2d7f60c2b90 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh @@ -59,6 +59,7 @@ train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. tdnn_affix=1d #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true # End configuration section. echo "$0 $@" # Print the command line for logging @@ -218,7 +219,7 @@ if [ $stage -le 18 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ + --cleanup.remove-egs $remove_egs \ --feat-dir $train_data_dir \ --tree-dir $tree_dir \ --lat-dir $lat_dir \ diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 4a61f8edaa7..e02c197fcd0 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -64,6 +64,7 @@ online_ivector_dir= # can be used if we are including speaker information as iV cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, # it doesn't make sense to use different options than were used as input to the # LDA transform). This is used to turn off CMVN in the online-nnet experiments. +lattice_lm_scale= echo "$0 $@" # Print the command line for logging @@ -304,6 +305,9 @@ chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$ali [ ! -z $left_tolerance ] && \ chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance" +[ ! 
-z $lattice_lm_scale ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" + echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context echo $left_context_initial > $dir/info/left_context_initial diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index b5597b15667..4f77345eee2 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -19,6 +19,7 @@ #include "chain/chain-supervision.h" #include "lat/lattice-functions.h" +#include "lat/push-lattice.h" #include "util/text-utils.h" #include "hmm/hmm-utils.h" #include @@ -142,9 +143,9 @@ bool ProtoSupervision::operator == (const ProtoSupervision &other) const { fst::Equal(fst, other.fst)); } -bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, - const CompactLattice &lat, - ProtoSupervision *proto_supervision) { +bool PhoneLatticeToProtoSupervisionInternal(const SupervisionOptions &opts, + const CompactLattice &lat, + ProtoSupervision *proto_supervision) { opts.Check(); if (lat.NumStates() == 0) { KALDI_WARN << "Empty lattice provided"; @@ -176,9 +177,10 @@ bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, return false; } proto_supervision->fst.AddArc(state, - fst::StdArc(phone, phone, - fst::TropicalWeight::One(), - lat_arc.nextstate)); + fst::StdArc(phone, phone, + fst::TropicalWeight(lat_arc.weight.Weight().Value1() + * opts.lm_scale), + lat_arc.nextstate)); int32 t_begin = std::max(0, (state_time - opts.left_tolerance)), t_end = std::min(num_frames, (next_state_time + opts.right_tolerance)), @@ -207,6 +209,46 @@ bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, return true; } +bool NormalizePhoneLattice(CompactLattice *clat) { + if (clat->Properties(fst::kTopSorted, false) == 0) { + if (fst::TopSort(clat) == false) { + KALDI_WARN << "Cycles detected in lattice: cannot normalize."; + return false; + } + } + + std::vector beta; + if (!ComputeCompactLatticeBetas(*clat, &beta)) { + KALDI_WARN << "Failed to compute backward probabilities on lattice."; + return false; + } + + CompactLattice::Arc::StateId start = clat->Start(); // Should be 0 + BaseFloat total_backward_cost = beta[start]; + + for (fst::StateIterator sit(*clat); !sit.Done(); sit.Next()) { + CompactLatticeWeight f = clat->Final(sit.Value()); + LatticeWeight w = f.Weight(); + w.SetValue1(w.Value1() + total_backward_cost); + f.SetWeight(w); + clat->SetFinal(sit.Value(), f); + } + return fst::PushCompactLatticeWeights(clat); +} + +bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, + const CompactLattice &lat, + ProtoSupervision *proto_supervision) { + if (opts.lm_scale == 0.0) { + return PhoneLatticeToProtoSupervisionInternal(opts, lat, proto_supervision); + } else { + CompactLattice normalized_clat(lat); + NormalizePhoneLattice(&normalized_clat); + return PhoneLatticeToProtoSupervisionInternal(opts, + normalized_clat, + proto_supervision); + } +} bool TimeEnforcerFst::GetArc(StateId s, Label ilabel, fst::StdArc* oarc) { // the following call will do the range-check on 'ilabel'. 
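With the new --lm-scale option and lattice input, the supervision for unsupervised data is built from decoded phone lattices rather than numerator alignments. Roughly, the per-job pipeline that get_egs.sh sets up looks like the sketch below; the paths, job index and scale values are placeholders, not the exact command line from the script:

lattice-align-phones --replace-output-symbols=true $latdir/final.mdl \
    "ark:gunzip -c $latdir/lat.1.gz|" ark:- | \
  chain-get-supervision --lattice-input=true --frame-subsampling-factor=1 \
    --lm-scale=0.1 --left-tolerance=2 --right-tolerance=2 \
    $chaindir/tree $chaindir/0.trans_mdl ark:- ark:$dir/sup.1.ark
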
diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index a94f68ade90..c70f7c989a2 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -50,10 +50,12 @@ struct SupervisionOptions { int32 left_tolerance; int32 right_tolerance; int32 frame_subsampling_factor; + BaseFloat lm_scale; SupervisionOptions(): left_tolerance(5), right_tolerance(5), - frame_subsampling_factor(1) { } + frame_subsampling_factor(1), + lm_scale(0.0) { } void Register(OptionsItf *opts) { opts->Register("left-tolerance", &left_tolerance, "Left tolerance for " @@ -65,6 +67,9 @@ struct SupervisionOptions { "frame-rate of the original alignment. Applied after " "left-tolerance and right-tolerance are applied (so they are " "in terms of the original num-frames."); + opts->Register("lm-scale", &lm_scale, "The scale with which the graph/lm " + "weights from the phone lattice are included in the " + "supervision fst."); } void Check() const; }; From b91276b40f0f4bb86306bebcac1655b19918e2d5 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Mon, 29 May 2017 18:02:55 -0400 Subject: [PATCH 006/174] Minor fixes --- .../s5_r2/local/chain/run_semisupervised.sh | 174 ++++++++++++++++++ .../local/chain/run_semisupervised_1a.sh | 158 ---------------- .../s5_r2/local/chain/tuning/run_tdnn_1d.sh | 3 +- 3 files changed, 176 insertions(+), 159 deletions(-) create mode 100755 egs/tedlium/s5_r2/local/chain/run_semisupervised.sh delete mode 100755 egs/tedlium/s5_r2/local/chain/run_semisupervised_1a.sh diff --git a/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh b/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh new file mode 100755 index 00000000000..3cde5924ca0 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh @@ -0,0 +1,174 @@ +#!/bin/bash + +set -e -o pipefail + + +stage=0 +nj=30 +decode_nj=30 +base_train_set=train_cleaned # the starting point train-set +base_gmm=tri3_cleaned # the starting point of training on the supervised data (no flat start for now) +semi_affix= # affix relating train-set splitting proportion + # (currently supervised 25%) and the base train set (currently _cleaned), etc. +tdnn_affix=_sup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# combination options +comb_affix=_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +unsup_decode_lattice_beam=8.0 +unsup_frames_per_eg= # if empty will be equal to the supervised model's config +lattice_lm_scale=0.1 # lm-scale for using the weights from unsupervised lattices +left_tolerance=2 +right_tolerance=2 +train_combined_opts= + +# to tune: +# frames_per_eg for unsupervised + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +supervised_set=${base_train_set}_sup${semi_affix} +unsupervised_set=${base_train_set}_unsup${semi_affix} +gmm=${base_gmm}_semi${semi_affix} # the gmm to be supplied to chain/run_tdnn.sh +nnet3_affix=_cleaned_semi${semi_affix} # affix for nnet3 and chain dirs + +if ! 
cuda-compiled; then + cat < data/$supervised_set/supervised_uttlist || true + utils/shuffle_list.pl data/$base_train_set/feats.scp | cut -d' ' -f1 | \ + tail -$num_unsupervised_utts > data/$supervised_set/unsupervised_uttlist || true + utils/subset_data_dir.sh --utt-list data/$supervised_set/supervised_uttlist \ + data/$base_train_set data/$supervised_set || exit 1 + utils/subset_data_dir.sh --utt-list data/$supervised_set/unsupervised_uttlist \ + data/$base_train_set data/$unsupervised_set || exit 1 + utils/data/subset_data_dir.sh --utt-list data/$unsupervised_set/feats.scp \ + data/${base_train_set}_sp_hires data/${unsupervised_set}_hires +fi + +if [ $stage -le -3 ]; then + # align the supervised subset with the current cleaned gmm + if [ -f $gmm/ali.1.gz ]; then + echo "$0: alignments in $gmm appear to already exist. Please either remove them " + echo " ... or use a later --stage option." + exit 1 + fi + echo "$0: aligning with the supervised data data/${supervised_set}" + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/${supervised_set} data/lang exp/$base_gmm exp/$gmm +fi + +if [ $stage -le -2 ]; then + echo "$0: chain training on the supervised subset data/${supervised_set}" + local/chain/run_tdnn.sh $train_supervised_opts --remove-egs false \ + --train-set $supervised_set --gmm $gmm \ + --nnet3-affix $nnet3_affix --tdnn-affix $tdnn_affix +fi + +if [ $stage -le -1 ]; then + echo "$0: getting ivectors for the hires unsupervised data data/${unsupervised_set}_hires" + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj "$nj" \ + data/${unsupervised_set}_hires exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires +fi + +chaindir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi + +left_context=`cat $chaindir/egs/info/left_context` +right_context=`cat $chaindir/egs/info/right_context` +left_context_initial=`cat $chaindir/egs/info/left_context_initial` +right_context_final=`cat $chaindir/egs/info/right_context_final` +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=`cat $chaindir/egs/info/frames_per_eg` +frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +cmvn_opts=`cat $chaindir/cmvn_opts` + +if [ $stage -le 0 ]; then + echo "$0: getting the decoding lattices for the unsupervised subset using the chain model at: $chaindir" + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 --lattice-beam $unsup_decode_lattice_beam \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $chaindir/graph data/${unsupervised_set}_hires $chaindir/decode_${unsupervised_set} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${unsupervised_set}_hires \ + ${chaindir}/decode_${unsupervised_set} ${chaindir}/decode_${unsupervised_set}_rescore + ln -s ../final.mdl $chaindir/decode_${unsupervised_set}_rescore/final.mdl || true +fi + +if [ $stage -le 1 ]; then + # TODO: set supervision.weight for the unsupervised data and tune it (maybe try 0.25,0.5,0.75) + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $left_tolerance --right-tolerance $right_tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg 
--frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + data/${unsupervised_set}_hires $chaindir ${chaindir}/decode_${unsupervised_set}_rescore $chaindir/unsup_egs +fi + +if [ $stage -le 2 ]; then + echo "$0: combining supervised/unsupervised egs" + num_archives=`cat $sup_chaindir/egs/info/num_archives` + sup_egs_dir=$sup_chaindir/egs + unsup_egs_dir=$chaindir/unsup_egs + comb_egs_dir=$chaindir/comb_egs + mkdir -p $comb_egs_dir/log + cp {$sup_egs_dir,$comb_egs_dir}/train_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/valid_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/combine.cegs + cp {$sup_egs_dir,$comb_egs_dir}/cmvn_opts + cp -r $sup_egs_dir/info $comb_egs_dir + cat {$sup_egs_dir,$unsup_egs_dir}/info/num_frames | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/num_frames + cat {$sup_egs_dir,$unsup_egs_dir}/info/egs_per_archive | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/egs_per_archive + out_egs_list= + egs_list= + for n in $(seq $num_archives); do + egs_list="$egs_list $sup_egs_dir/cegs.$n.ark" + egs_list="$egs_list $unsup_egs_dir/cegs.$n.ark" + out_egs_list="$out_egs_list ark:$comb_egs_dir/cegs.$n.ark" + done + srand=0 + # $decode_cmd --mem 8G $comb_egs_dir/log/shuffle_combine.log \ + # nnet3-chain-shuffle-egs --srand=$srand "ark:cat $egs_list|" ark:- \| \ + # nnet3-chain-copy-egs --random=true --srand=$srand ark:- $out_egs_list + $decode_cmd $comb_egs_dir/log/combine.log \ + nnet3-chain-copy-egs "ark:cat $egs_list|" $out_egs_list +fi + +if [ $stage -le 3 ]; then + echo "$0: training on the supervised+unsupervised subset" + # the train-set and gmm do not matter as we are providing the egs + local/chain/run_tdnn.sh --stage 17 --remove-egs false --train-set $supervised_set --gmm $gmm \ + --nnet3-affix $nnet3_affix --tdnn-affix ${tdnn_affix}${comb_affix} \ + --common-egs-dir $chaindir/comb_egs $train_combined_opts +fi + +#final_combined_chaindir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}${comb_affix}_sp_bi + diff --git a/egs/tedlium/s5_r2/local/chain/run_semisupervised_1a.sh b/egs/tedlium/s5_r2/local/chain/run_semisupervised_1a.sh deleted file mode 100755 index 3760b755bb9..00000000000 --- a/egs/tedlium/s5_r2/local/chain/run_semisupervised_1a.sh +++ /dev/null @@ -1,158 +0,0 @@ -#!/bin/bash - -set -e -o pipefail - - -#TODO: change some of these _sup to _semi - - -stage=0 -train_sup_stage_opt="--stage -10 --train-stage -10" -nj=30 -decode_nj=30 -base_supervised_set=train_cleaned -supervised_set=${base_supervised_set}_sup -unsupervised_set=${base_supervised_set}_unsup -base_gmm=tri3_cleaned # the starting point of training on the supervised data (no flat start for now) -gmm=${base_gmm}_sup # the gmm to be supplied to chain/run_tdnn.sh -nnet3_affix=_cleaned_sup # cleanup affix for nnet3 and chain dirs -tdnn_affix=_sup1d #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -lattice_lm_scale=0.1 -left_tolerance=2 -right_tolerance=2 - -num_iters=3 -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! 
cuda-compiled; then - cat < data/$supervised_set/supervised_uttlist || true - utils/shuffle_list.pl data/$base_supervised_set/feats.scp | cut -d' ' -f1 | tail -$num_unsupervised_utts > data/$supervised_set/unsupervised_uttlist || true - utils/subset_data_dir.sh --utt-list data/$supervised_set/supervised_uttlist data/$base_supervised_set data/$supervised_set || exit 1 - utils/subset_data_dir.sh --utt-list data/$supervised_set/unsupervised_uttlist data/$base_supervised_set data/$unsupervised_set || exit 1 - utils/data/subset_data_dir.sh --utt-list data/$unsupervised_set/feats.scp data/${base_supervised_set}_sp_hires data/${unsupervised_set}_hires -fi - -if [ $stage -le -3 ]; then - # align the supervised subset with the current cleaned gmm - if [ -f $gmm/ali.1.gz ]; then - echo "$0: alignments in $gmm appear to already exist. Please either remove them " - echo " ... or use a later --stage option." - exit 1 - fi - echo "$0: aligning with the supervised data" - steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ - data/${supervised_set} data/lang exp/$base_gmm exp/$gmm - exit 0; -fi - -if [ $stage -le -2 ]; then - echo "$0: training on the supervised subset" - local/chain/run_tdnn.sh $train_sup_stage_opt --remove-egs false --train-set $supervised_set --gmm $gmm --nnet3-affix $nnet3_affix --tdnn-affix $tdnn_affix - exit 0; -fi - -if [ $stage -le -1 ]; then - echo "$0: getting ivectors for the unsupervised data" - steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj "$nj" \ - data/${unsupervised_set}_hires exp/nnet3${nnet3_affix}/extractor \ - exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires -fi - -chaindir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi -sup_chaindir=$chaindir - -left_context=`cat $chaindir/egs/info/left_context` -right_context=`cat $chaindir/egs/info/right_context` -left_context_initial=`cat $chaindir/egs/info/left_context_initial` -right_context_final=`cat $chaindir/egs/info/right_context_final` -frames_per_eg=`cat $chaindir/egs/info/frames_per_eg` -frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` -cmvn_opts=`cat $chaindir/cmvn_opts` - -for iter in $(seq 0 $[$num_iters-1]); do - echo "$0: iteration: $iter" - - if [ $iter -ge $stage ]; then - echo "$0: getting the decoding lattices for the unsupervised subset using the chain model at: $chaindir" - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $chaindir/graph data/${unsupervised_set}_hires $chaindir/decode_${unsupervised_set} - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${unsupervised_set}_hires \ - ${chaindir}/decode_${unsupervised_set} ${chaindir}/decode_${unsupervised_set}_rescore - ln -s ../final.mdl $chaindir/decode_${unsupervised_set}_rescore/final.mdl || true - - echo "$0: generating egs from the unsupervised data" - steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ - --left-tolerance $left_tolerance --right-tolerance $right_tolerance \ - --left-context $left_context --right-context $right_context \ - --left-context-initial $left_context_initial --right-context-final $right_context_final \ - --frames-per-eg $frames_per_eg --frames-per-iter 1500000 \ - --frame-subsampling-factor $frame_subsampling_factor \ - --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ - --online-ivector-dir 
exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ - data/${unsupervised_set}_hires $chaindir ${chaindir}/decode_${unsupervised_set}_rescore $chaindir/unsup_egs - - # TODO: set supervision.weight for the unsupervised data and tune it (maybe try 0.25,0.5,0.75) - echo "$0: combining supervised/unsupervised egs" - num_archives=`cat $sup_chaindir/egs/info/num_archives` - sup_egs_dir=$sup_chaindir/egs - unsup_egs_dir=$chaindir/unsup_egs - comb_egs_dir=$chaindir/comb_egs - mkdir -p $comb_egs_dir/log - cp {$sup_egs_dir,$comb_egs_dir}/train_diagnostic.cegs - cp {$sup_egs_dir,$comb_egs_dir}/valid_diagnostic.cegs - cp {$sup_egs_dir,$comb_egs_dir}/combine.cegs - cp {$sup_egs_dir,$comb_egs_dir}/cmvn_opts - cp -r $sup_egs_dir/info $comb_egs_dir - cat {$sup_egs_dir,$unsup_egs_dir}/info/num_frames | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/num_frames - cat {$sup_egs_dir,$unsup_egs_dir}/info/egs_per_archive | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/egs_per_archive - out_egs_list= - egs_list= - for n in $(seq $num_archives); do - egs_list="$egs_list $sup_egs_dir/cegs.$n.ark" - egs_list="$egs_list $unsup_egs_dir/cegs.$n.ark" - out_egs_list="$out_egs_list ark:$comb_egs_dir/cegs.$n.ark" - done - srand=0 - # $decode_cmd --mem 8G $comb_egs_dir/log/shuffle_combine.log \ - # nnet3-chain-shuffle-egs --srand=$srand "ark:cat $egs_list|" ark:- \| \ - # nnet3-chain-copy-egs --random=true --srand=$srand ark:- $out_egs_list - $decode_cmd $comb_egs_dir/log/combine.log \ - nnet3-chain-copy-egs "ark:cat $egs_list|" $out_egs_list - - echo "$0: training on the supervised+unsupervised subset" - # the train-set and gmm do not matter as we are providing the egs - local/chain/run_tdnn.sh --stage 17 --remove-egs false --train-set $supervised_set --gmm $gmm \ - --nnet3-affix $nnet3_affix --tdnn-affix ${tdnn_affix}_comb$[iter+1] --common-egs-dir $chaindir/comb_egs - fi - chaindir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_comb$[iter+1]_sp_bi - -done diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh index 2d7f60c2b90..1c4a032fc57 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh @@ -52,6 +52,7 @@ train_set=train_cleaned gmm=tri3_cleaned # the gmm for the target data num_threads_ubm=32 nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +num_epochs=4 # The rest are configs specific to this script. Most of the parameters # are just hardcoded at this level, in the commands below. 
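Exposing num_epochs here (together with remove_egs from the earlier patch) is what lets the semi-supervised wrapper re-run the same tuning script on the combined egs with a different schedule. An illustrative call, assuming the wrapper's variables ($supervised_set, $gmm, $comb_egs_dir, etc.) are already set:

local/chain/run_tdnn.sh --stage 17 --remove-egs false --num-epochs 5 \
  --train-set $supervised_set --gmm $gmm --nnet3-affix $nnet3_affix \
  --tdnn-affix ${tdnn_affix}_comb1 --common-egs-dir $comb_egs_dir
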
@@ -213,7 +214,7 @@ if [ $stage -le 18 ]; then --egs.chunk-width 150 \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ + --trainer.num-epochs $num_epochs \ --trainer.optimization.num-jobs-initial 2 \ --trainer.optimization.num-jobs-final 12 \ --trainer.optimization.initial-effective-lrate 0.001 \ From ba261203fa2e3982f5cc5ff8464b4c9db5205717 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Tue, 30 May 2017 00:40:52 -0400 Subject: [PATCH 007/174] Add more options to run_semisupervised.sh --- .../s5_r2/local/chain/run_semisupervised.sh | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh b/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh index 3cde5924ca0..824b3ce5f0b 100755 --- a/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh +++ b/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh @@ -14,13 +14,15 @@ tdnn_affix=_sup1a # affix for the supervised chain-model directory train_supervised_opts="--stage -10 --train-stage -10" # combination options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir comb_affix=_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets unsup_decode_lattice_beam=8.0 unsup_frames_per_eg= # if empty will be equal to the supervised model's config lattice_lm_scale=0.1 # lm-scale for using the weights from unsupervised lattices left_tolerance=2 right_tolerance=2 -train_combined_opts= +train_combined_opts="--num-epochs 5" # to tune: # frames_per_eg for unsupervised @@ -77,7 +79,7 @@ if [ $stage -le -3 ]; then echo " ... or use a later --stage option." exit 1 fi - echo "$0: aligning with the supervised data data/${supervised_set}" + echo "$0: aligning the supervised data data/${supervised_set}" steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ data/${supervised_set} data/lang exp/$base_gmm exp/$gmm fi @@ -112,11 +114,11 @@ if [ $stage -le 0 ]; then --acwt 1.0 --post-decode-acwt 10.0 --lattice-beam $unsup_decode_lattice_beam \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ --scoring-opts "--min-lmwt 5 " \ - $chaindir/graph data/${unsupervised_set}_hires $chaindir/decode_${unsupervised_set} + $chaindir/graph data/${unsupervised_set}_hires $chaindir/decode_${unsupervised_set}${decode_affix} steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ data/${unsupervised_set}_hires \ - ${chaindir}/decode_${unsupervised_set} ${chaindir}/decode_${unsupervised_set}_rescore - ln -s ../final.mdl $chaindir/decode_${unsupervised_set}_rescore/final.mdl || true + ${chaindir}/decode_${unsupervised_set}${decode_affix} ${chaindir}/decode_${unsupervised_set}${decode_affix}_rescore + ln -s ../final.mdl $chaindir/decode_${unsupervised_set}${decode_affix}_rescore/final.mdl || true fi if [ $stage -le 1 ]; then @@ -130,15 +132,16 @@ if [ $stage -le 1 ]; then --frame-subsampling-factor $frame_subsampling_factor \ --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ - data/${unsupervised_set}_hires $chaindir ${chaindir}/decode_${unsupervised_set}_rescore $chaindir/unsup_egs + data/${unsupervised_set}_hires $chaindir \ + ${chaindir}/decode_${unsupervised_set}${decode_affix}_rescore $chaindir/unsup_egs${decode_affix}${egs_affix} fi +sup_egs_dir=$chaindir/egs 
+unsup_egs_dir=$chaindir/unsup_egs${decode_affix}${egs_affix} +comb_egs_dir=$chaindir/comb_egs${decode_affix}${egs_affix} if [ $stage -le 2 ]; then echo "$0: combining supervised/unsupervised egs" - num_archives=`cat $sup_chaindir/egs/info/num_archives` - sup_egs_dir=$sup_chaindir/egs - unsup_egs_dir=$chaindir/unsup_egs - comb_egs_dir=$chaindir/comb_egs + num_archives=`cat $chaindir/egs/info/num_archives` mkdir -p $comb_egs_dir/log cp {$sup_egs_dir,$comb_egs_dir}/train_diagnostic.cegs cp {$sup_egs_dir,$comb_egs_dir}/valid_diagnostic.cegs @@ -159,15 +162,16 @@ if [ $stage -le 2 ]; then # nnet3-chain-shuffle-egs --srand=$srand "ark:cat $egs_list|" ark:- \| \ # nnet3-chain-copy-egs --random=true --srand=$srand ark:- $out_egs_list $decode_cmd $comb_egs_dir/log/combine.log \ - nnet3-chain-copy-egs "ark:cat $egs_list|" $out_egs_list + nnet3-chain-copy-egs "ark:cat $egs_list|" $out_egs_list fi if [ $stage -le 3 ]; then echo "$0: training on the supervised+unsupervised subset" # the train-set and gmm do not matter as we are providing the egs local/chain/run_tdnn.sh --stage 17 --remove-egs false --train-set $supervised_set --gmm $gmm \ - --nnet3-affix $nnet3_affix --tdnn-affix ${tdnn_affix}${comb_affix} \ - --common-egs-dir $chaindir/comb_egs $train_combined_opts + --nnet3-affix $nnet3_affix \ + --tdnn-affix ${tdnn_affix}${decode_affix}${egs_affix}${comb_affix} \ + --common-egs-dir $comb_egs_dir $train_combined_opts fi #final_combined_chaindir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}${comb_affix}_sp_bi From 477bdf3b18b324739aa235b5dc22570d171be655 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Wed, 31 May 2017 14:23:27 -0400 Subject: [PATCH 008/174] Add a check in supervision code --- src/chain/chain-supervision.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 4f77345eee2..4cbe5f56137 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -243,7 +243,8 @@ bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, return PhoneLatticeToProtoSupervisionInternal(opts, lat, proto_supervision); } else { CompactLattice normalized_clat(lat); - NormalizePhoneLattice(&normalized_clat); + if (!NormalizePhoneLattice(&normalized_clat)) + return false; // Already warned return PhoneLatticeToProtoSupervisionInternal(opts, normalized_clat, proto_supervision); From f55f686ca155fc0e8e0543fd19f61e9429aae544 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Fri, 2 Jun 2017 18:28:14 -0400 Subject: [PATCH 009/174] Some fixes + new options --- .../s5_r2/local/chain/run_semisupervised.sh | 19 ++++---- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 13 ++++- src/chain/chain-supervision.cc | 48 ++++--------------- src/chain/chain-supervision.h | 10 +++- 4 files changed, 40 insertions(+), 50 deletions(-) diff --git a/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh b/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh index 824b3ce5f0b..674b8745c42 100755 --- a/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh +++ b/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh @@ -2,6 +2,12 @@ set -e -o pipefail +# e.g. 
try lm-scale: +# local/chain/run_semisupervised.sh --stage 1 --tdnn-affix _sup1a --egs-affix _lmwt1.0 --lattice-lm-scale 1.0 + + +# frames_per_eg 300 +# local/chain/run_semisupervised.sh --stage 1 --tdnn-affix _sup1d --unsup-frames-per-eg 300 --egs-affix _fpe300 stage=0 nj=30 @@ -17,9 +23,10 @@ train_supervised_opts="--stage -10 --train-stage -10" decode_affix= egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir comb_affix=_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets -unsup_decode_lattice_beam=8.0 unsup_frames_per_eg= # if empty will be equal to the supervised model's config +unsup_egs_weight=1.0 lattice_lm_scale=0.1 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam= # If supplied will prune the lattices prior to getting egs for unsupervised data left_tolerance=2 right_tolerance=2 train_combined_opts="--num-epochs 5" @@ -111,7 +118,7 @@ cmvn_opts=`cat $chaindir/cmvn_opts` if [ $stage -le 0 ]; then echo "$0: getting the decoding lattices for the unsupervised subset using the chain model at: $chaindir" steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 --lattice-beam $unsup_decode_lattice_beam \ + --acwt 1.0 --post-decode-acwt 10.0 \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ --scoring-opts "--min-lmwt 5 " \ $chaindir/graph data/${unsupervised_set}_hires $chaindir/decode_${unsupervised_set}${decode_affix} @@ -122,7 +129,6 @@ if [ $stage -le 0 ]; then fi if [ $stage -le 1 ]; then - # TODO: set supervision.weight for the unsupervised data and tune it (maybe try 0.25,0.5,0.75) echo "$0: generating egs from the unsupervised data" steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ --left-tolerance $left_tolerance --right-tolerance $right_tolerance \ @@ -131,6 +137,8 @@ if [ $stage -le 1 ]; then --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ --frame-subsampling-factor $frame_subsampling_factor \ --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --egs-weight $unsup_egs_weight \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ data/${unsupervised_set}_hires $chaindir \ ${chaindir}/decode_${unsupervised_set}${decode_affix}_rescore $chaindir/unsup_egs${decode_affix}${egs_affix} @@ -158,9 +166,6 @@ if [ $stage -le 2 ]; then out_egs_list="$out_egs_list ark:$comb_egs_dir/cegs.$n.ark" done srand=0 - # $decode_cmd --mem 8G $comb_egs_dir/log/shuffle_combine.log \ - # nnet3-chain-shuffle-egs --srand=$srand "ark:cat $egs_list|" ark:- \| \ - # nnet3-chain-copy-egs --random=true --srand=$srand ark:- $out_egs_list $decode_cmd $comb_egs_dir/log/combine.log \ nnet3-chain-copy-egs "ark:cat $egs_list|" $out_egs_list fi @@ -174,5 +179,3 @@ if [ $stage -le 3 ]; then --common-egs-dir $comb_egs_dir $train_combined_opts fi -#final_combined_chaindir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}${comb_affix}_sp_bi - diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index e02c197fcd0..50b3dc60572 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -64,7 +64,13 @@ online_ivector_dir= # can be used if we are including speaker information as iV cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, # it doesn't make sense to 
use different options than were used as input to the # LDA transform). This is used to turn off CMVN in the online-nnet experiments. -lattice_lm_scale= +lattice_lm_scale= # If supplied, the graph/lm weight of the lattices will be + # used (with this scale) in generating supervisions +egs_weight=1.0 # The weight which determines how much each training example + # contributes to gradients while training (can be used + # to down/up-weight a dataset) +lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, + # before being used to get supervisions. echo "$0 $@" # Print the command line for logging @@ -286,8 +292,10 @@ fi if [ $stage -le 2 ]; then echo "$0: copying training lattices" + [ ! -z $lattice_prune_beam ] && \ + prune_cmd="ark:- | lattice-prune --acoustic-scale=0.1 --beam=$lattice_prune_beam ark:-" $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ - lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; + lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" $prune_cmd ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp fi @@ -386,6 +394,7 @@ if [ $stage -le 4 ]; then utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp \| \ lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ chain-get-supervision $chain_supervision_all_opts \ + --weight=$egs_weight \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ --num-frames-overlap=$frames_overlap_per_eg \ diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 4cbe5f56137..35489ca5e22 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -179,7 +179,7 @@ bool PhoneLatticeToProtoSupervisionInternal(const SupervisionOptions &opts, proto_supervision->fst.AddArc(state, fst::StdArc(phone, phone, fst::TropicalWeight(lat_arc.weight.Weight().Value1() - * opts.lm_scale), + * opts.lm_scale + opts.phone_ins_penalty), lat_arc.nextstate)); int32 t_begin = std::max(0, (state_time - opts.left_tolerance)), t_end = std::min(num_frames, @@ -191,7 +191,8 @@ bool PhoneLatticeToProtoSupervisionInternal(const SupervisionOptions &opts, proto_supervision->allowed_phones[t_subsampled].push_back(phone); } if (lat.Final(state) != CompactLatticeWeight::Zero()) { - proto_supervision->fst.SetFinal(state, fst::TropicalWeight::One()); + proto_supervision->fst.SetFinal(state, fst::TropicalWeight( + lat.Final(state).Weight().Value1() * opts.lm_scale)); if (state_times[state] != num_frames) { KALDI_WARN << "Time of final state " << state << " in lattice is " << "not equal to number of frames " << num_frames @@ -209,46 +210,15 @@ bool PhoneLatticeToProtoSupervisionInternal(const SupervisionOptions &opts, return true; } -bool NormalizePhoneLattice(CompactLattice *clat) { - if (clat->Properties(fst::kTopSorted, false) == 0) { - if (fst::TopSort(clat) == false) { - KALDI_WARN << "Cycles detected in lattice: cannot normalize."; - return false; - } - } - - std::vector beta; - if (!ComputeCompactLatticeBetas(*clat, &beta)) { - KALDI_WARN << "Failed to compute backward probabilities on lattice."; - return false; - } - - CompactLattice::Arc::StateId start = clat->Start(); // Should be 0 - BaseFloat total_backward_cost = beta[start]; - - for (fst::StateIterator sit(*clat); !sit.Done(); sit.Next()) { - CompactLatticeWeight f = clat->Final(sit.Value()); - LatticeWeight 
w = f.Weight(); - w.SetValue1(w.Value1() + total_backward_cost); - f.SetWeight(w); - clat->SetFinal(sit.Value(), f); - } - return fst::PushCompactLatticeWeights(clat); -} - bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, const CompactLattice &lat, ProtoSupervision *proto_supervision) { - if (opts.lm_scale == 0.0) { - return PhoneLatticeToProtoSupervisionInternal(opts, lat, proto_supervision); - } else { - CompactLattice normalized_clat(lat); - if (!NormalizePhoneLattice(&normalized_clat)) - return false; // Already warned - return PhoneLatticeToProtoSupervisionInternal(opts, - normalized_clat, - proto_supervision); - } + if (!PhoneLatticeToProtoSupervisionInternal(opts, lat, proto_supervision)) + return false; + if (opts.lm_scale != 0.0) + fst::Push(&(proto_supervision->fst), + fst::REWEIGHT_TO_INITIAL, fst::kDelta, true); + return true; } bool TimeEnforcerFst::GetArc(StateId s, Label ilabel, fst::StdArc* oarc) { diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index c70f7c989a2..ce755f0cb63 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -50,12 +50,16 @@ struct SupervisionOptions { int32 left_tolerance; int32 right_tolerance; int32 frame_subsampling_factor; + BaseFloat weight; BaseFloat lm_scale; + BaseFloat phone_ins_penalty; SupervisionOptions(): left_tolerance(5), right_tolerance(5), frame_subsampling_factor(1), - lm_scale(0.0) { } + weight(1.0), + lm_scale(0.0), + phone_ins_penalty(0.0) { } void Register(OptionsItf *opts) { opts->Register("left-tolerance", &left_tolerance, "Left tolerance for " @@ -67,9 +71,13 @@ struct SupervisionOptions { "frame-rate of the original alignment. Applied after " "left-tolerance and right-tolerance are applied (so they are " "in terms of the original num-frames."); + opts->Register("weight", &weight, + "Use this to set the supervision weight for training"); opts->Register("lm-scale", &lm_scale, "The scale with which the graph/lm " "weights from the phone lattice are included in the " "supervision fst."); + opts->Register("phone-ins-penalty", &phone_ins_penalty, + "The penalty to penalize longer paths"); } void Check() const; }; From 403e3e25f52e06a222fe735bea536b02ebdbf0cb Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Tue, 6 Jun 2017 12:38:10 -0400 Subject: [PATCH 010/174] Add nnet3, chain, and semi_sepervised scripts for fisher english --- .../s5/local/chain/compare_wer_general.sh | 106 +++++++++++ .../s5/local/chain/run_semisupervised.sh | 136 ++++++++++++++ egs/fisher_english/s5/local/chain/run_tdnn.sh | 168 ++++++++++++++++++ .../s5/local/nnet3/run_ivector_common.sh | 147 +++++++++++++++ egs/fisher_english/s5/local/nnet3/run_tdnn.sh | 98 ++++++++++ egs/fisher_english/s5/local/score.sh | 1 + 6 files changed, 656 insertions(+) create mode 100755 egs/fisher_english/s5/local/chain/compare_wer_general.sh create mode 100755 egs/fisher_english/s5/local/chain/run_semisupervised.sh create mode 100755 egs/fisher_english/s5/local/chain/run_tdnn.sh create mode 100755 egs/fisher_english/s5/local/nnet3/run_ivector_common.sh create mode 100644 egs/fisher_english/s5/local/nnet3/run_tdnn.sh diff --git a/egs/fisher_english/s5/local/chain/compare_wer_general.sh b/egs/fisher_english/s5/local/chain/compare_wer_general.sh new file mode 100755 index 00000000000..2f724c8ff81 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/compare_wer_general.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. 
local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_c_sp exp/chain_cleaned/tdnn_c_sp_smbr:{1,2,3} + + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free directory name, like: +# set_names exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain_cleaned/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain_cleaned/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=("# WER on dev " "# WER on test ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} test${epoch_infix}) + wer=$(grep WER $dirname/decode_${decode_names[$n]}/wer* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} test${epoch_infix}) + wer=$(grep WER $dirname/decode_looped_${decode_names[$n]}/wer* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done + +echo diff --git a/egs/fisher_english/s5/local/chain/run_semisupervised.sh b/egs/fisher_english/s5/local/chain/run_semisupervised.sh new file mode 100755 index 00000000000..50c9b04cf48 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/run_semisupervised.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +set -e -o pipefail + +stage=-2 +nj=30 +decode_nj=30 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k # affix relating train-set splitting proportion + +tdnn_affix=_sup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# combination options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +comb_affix=_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +unsup_frames_per_eg= # if empty will be equal to the supervised model's config +unsup_egs_weight=1.0 +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +left_tolerance=2 +right_tolerance=2 +train_combined_opts="--num-epochs 4.5" + +# to tune: +# frames_per_eg for unsupervised + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs + +if ! 
cuda-compiled; then + cat <$n1?$n2:$n1)) + num_archives=$[num_archives*3/2] + mkdir -p $comb_egs_dir/log + cp {$sup_egs_dir,$comb_egs_dir}/train_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/valid_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/combine.cegs + cp {$sup_egs_dir,$comb_egs_dir}/cmvn_opts + cp -r $sup_egs_dir/info $comb_egs_dir + echo $num_archives > $comb_egs_dir/info/num_archives + cat {$sup_egs_dir,$unsup_egs_dir}/info/num_frames | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/num_frames + cat {$sup_egs_dir,$unsup_egs_dir}/info/egs_per_archive | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/egs_per_archive + out_egs_list= + egs_list= + for n in $(seq $num_archives); do + [ -f $sup_egs_dir/cegs.$n.ark ] && egs_list="$egs_list $sup_egs_dir/cegs.$n.ark" + [ -f $unsup_egs_dir/cegs.$n.ark ] && egs_list="$egs_list $unsup_egs_dir/cegs.$n.ark" + out_egs_list="$out_egs_list ark:$comb_egs_dir/cegs.$n.ark" + done + srand=0 + $decode_cmd $comb_egs_dir/log/combine.log \ + nnet3-chain-copy-egs "ark:cat $egs_list|" $out_egs_list +fi + +if [ $stage -le 3 ]; then + echo "$0: training on the supervised+unsupervised subset" + # the train-set and gmm do not matter as we are providing the egs + local/chain/run_tdnn.sh --stage 12 --remove-egs false --train-set $supervised_set \ + --nnet3-affix $nnet3_affix \ + --tdnn-affix ${tdnn_affix}${decode_affix}${egs_affix}${comb_affix} \ + --common-egs-dir $comb_egs_dir $train_combined_opts +fi diff --git a/egs/fisher_english/s5/local/chain/run_tdnn.sh b/egs/fisher_english/s5/local/chain/run_tdnn.sh new file mode 100755 index 00000000000..ce9cf0d1d1d --- /dev/null +++ b/egs/fisher_english/s5/local/chain/run_tdnn.sh @@ -0,0 +1,168 @@ +#!/bin/bash +set -e + +# Based on run_tdnn_7b.sh in the fisher swbd recipe + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train +tree_affix= +nnet3_affix= +gmm=tri5a + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 11000 $build_tree_train_data_dir $lang $build_tree_ali_dir $treedir || exit 1; +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + # create the config files for nnet initialization + steps/nnet3/tdnn/make_configs.py \ + --self-repair-scale-nonlinearity 0.00001 \ + --feat-dir $train_data_dir \ + --ivector-dir $train_ivector_dir \ + --tree-dir $treedir \ + --relu-dim 725 \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize 0.1 \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target 0.5 \ + $dir/configs || exit 1; +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +decode_suff= +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh b/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..6505381b03f --- /dev/null +++ b/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,147 @@ +#!/bin/bash + +. ./cmd.sh +set -e +stage=1 +generate_alignments=true # false if doing chain training +speed_perturb=true +train_set=train + +lda_train_set=train_100k +nnet3_affix= +gmm=tri2_ali # should also contain alignments for $lda_train_set + +. ./path.sh +. ./utils/parse_options.sh + +gmm_dir=exp/$gmm + +# perturbed data preparation +if [ "$speed_perturb" == "true" ]; then + if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have + # to perturb the normal data to get the alignments. 
+ # _sp stands for speed-perturbed + + for datadir in ${train_set}; do + utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1 + utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2 + utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2 + utils/validate_data_dir.sh --no-feats data/${datadir}_tmp + rm -r data/temp1 data/temp2 + + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_tmp + + utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0 + utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0 + utils/fix_data_dir.sh data/${datadir}_sp + rm -r data/temp0 data/${datadir}_tmp + done + fi + + if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then + #obtain the alignment of the perturbed data + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang exp/tri5a exp/tri5a_ali_${train_set}_sp || exit 1 + fi + train_set=${train_set}_sp +fi + +if [ $stage -le 3 ]; then + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english-$date/s5b/$mfccdir/storage $mfccdir/storage + fi + + # the 100k directory is copied seperately, as + # we want to use exp/tri2_ali for lda_mllt training + # the main train directory might be speed_perturbed + for dataset in $train_set $lda_train_set; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + + # scale the waveforms, this is useful as we don't use CMVN + data_dir=data/${dataset}_hires + cat $data_dir/wav.scp | python -c " +import sys, os, subprocess, re, random +scale_low = 1.0/8 +scale_high = 2.0 +for line in sys.stdin.readlines(): + if len(line.strip()) == 0: + continue + print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) +"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1; + mv $data_dir/wav.scp_scaled $data_dir/wav.scp + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir; + + # Remove the small number of utterances that couldn't be extracted for some + # reason (e.g. too short; no such file). 
+ utils/fix_data_dir.sh data/${dataset}_hires; + done + + for dataset in test dev; do + # Create MFCCs for the eval set + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ + data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems + done + + # Take the first 30k utterances (about 1/8th of the data) this will be used + # for the diagubm training + utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires + utils/data/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr +fi + +# ivector extractor training +if [ $stage -le 4 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We use --num-iters 13 because after we get + # the transform (12th iter is the last), any further training is pointless. + # this decision is based on fisher_english + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --splice-opts "--left-context=3 --right-context=3" \ + 5500 90000 data/${lda_train_set}_hires \ + data/lang $gmm_dir exp/nnet3${nnet3_affix}/tri3a +fi + +if [ $stage -le 5 ]; then + # To train a diagonal UBM we don't need very much data, so use the smallest subset. + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${train_set}_30k_nodup_hires 512 exp/nnet3${nnet3_affix}/tri3a exp/nnet3${nnet3_affix}/diag_ubm +fi + +if [ $stage -le 6 ]; then + # iVector extractors can be sensitive to the amount of data, but this one has a + # fairly small dim (defaults to 100) so we don't use all of it, we use just the + # 100k subset (just under half the data). + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${lda_train_set}_hires exp/nnet3${nnet3_affix}/diag_ubm exp/nnet3${nnet3_affix}/extractor || exit 1; +fi + +if [ $stage -le 7 ]; then + # We extract iVectors on all the ${train_set} data, which will be what we + # train the system on. + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set}_max2_hires exp/nnet3${nnet3_affix}/extractor exp/nnet3${nnet3_affix}/ivectors_${train_set}_hires || exit 1; + + for dataset in test dev; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${dataset}_hires exp/nnet3${nnet3_affix}/extractor exp/nnet3${nnet3_affix}/ivectors_${dataset}_hires || exit 1; + done +fi + +exit 0; diff --git a/egs/fisher_english/s5/local/nnet3/run_tdnn.sh b/egs/fisher_english/s5/local/nnet3/run_tdnn.sh new file mode 100644 index 00000000000..f055b853b61 --- /dev/null +++ b/egs/fisher_english/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1,98 @@ +#!/bin/bash + +# This script is not tested. + +# this is the standard "tdnn" system, built in nnet3; it's what we used to +# call multi-splice. + +. ./cmd.sh + + +# At this script level we don't support not running on GPU, as it would be painfully slow. 
+# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +stage=0 +affix= +train_stage=-10 +common_egs_dir= +reporting_email= +remove_egs=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < Date: Wed, 21 Jun 2017 23:01:21 -0400 Subject: [PATCH 011/174] SMBR chain --- src/chain/Makefile | 3 +- src/chain/chain-denominator-smbr.cc | 292 ++++++++++++++++++---------- src/chain/chain-denominator-smbr.h | 102 ++++++++-- src/chain/chain-denominator.h | 2 - src/chain/chain-kernels-ansi.h | 30 +++ src/chain/chain-kernels.cu | 246 +++++++++++++++++++++++ src/chain/chain-training.cc | 78 ++++++++ src/chain/chain-training.h | 46 ++++- src/cudamatrix/cu-kernels.cu | 17 ++ src/cudamatrix/cu-kernels.h | 8 + src/cudamatrix/cu-vector.cc | 19 ++ src/cudamatrix/cu-vector.h | 3 + src/nnet3/nnet-chain-diagnostics.cc | 18 +- src/nnet3/nnet-chain-training.cc | 18 +- 14 files changed, 756 insertions(+), 126 deletions(-) diff --git a/src/chain/Makefile b/src/chain/Makefile index 288cd963008..80bf66d4cb8 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -8,7 +8,8 @@ LDLIBS += $(CUDA_LDLIBS) TESTFILES = chain-supervision-test language-model-test OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \ - language-model.o chain-denominator.o chain-training.o + language-model.o chain-denominator.o chain-training.o \ + chain-denominator-smbr.o ifeq ($(CUDA), true) OBJFILES += chain-kernels.o endif diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc index 6ed1a4e18da..752f94af504 100644 --- a/src/chain/chain-denominator-smbr.cc +++ b/src/chain/chain-denominator-smbr.cc @@ -1,7 +1,6 @@ // chain/chain-denominator-smbr.cc // Copyright 2015 Johns Hopkins University (author: Daniel Povey) -// 2016 Vimal Manohar // See ../../COPYING for clarification regarding multiple authors // @@ -29,12 +28,14 @@ DenominatorSmbrComputation::DenominatorSmbrComputation( const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, int32 num_sequences, - const CuMatrixBase &nnet_output): + const CuMatrixBase &nnet_output, + const CuMatrixBase &num_posteriors): opts_(opts), den_graph_(den_graph), num_sequences_(num_sequences), frames_per_sequence_(nnet_output.NumRows() / num_sequences_), exp_nnet_output_transposed_(nnet_output, kTrans), + num_posteriors_(num_posteriors), nnet_output_deriv_transposed_( exp_nnet_output_transposed_.NumRows(), std::min(exp_nnet_output_transposed_.NumCols(), @@ -43,23 +44,23 @@ DenominatorSmbrComputation::DenominatorSmbrComputation( alpha_(frames_per_sequence_ + 1, den_graph_.NumStates() * num_sequences_ + num_sequences_, kUndefined), + alpha_smbr_(frames_per_sequence_ + 1, + den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), beta_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_, kUndefined), - tot_prob_(num_sequences_, kUndefined), - tot_log_prob_(num_sequences_, kUndefined), - log_correction_term_(num_sequences_, kUndefined), - ok_(true), - alpha_smbr_(frames_per_sequence_ + 1, - den_graph_.NumStates() * num_sequences_ + num_sequences_, - kUndefined), beta_smbr_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_, - kUndefined), - tot_objf_(num_sequences_, kUndefined) { + kUndefined), + tot_prob_(num_sequences_, kUndefined), + tot_smbr_(num_sequences_, kUndefined), + ok_(true) { KALDI_ASSERT(opts_.leaky_hmm_coefficient > 0.0 && opts_.leaky_hmm_coefficient < 1.0); // make sure the alpha sums 
and beta sums are zeroed. alpha_.ColRange(den_graph_.NumStates() * num_sequences_, num_sequences_).SetZero(); + beta_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); alpha_smbr_.ColRange(den_graph_.NumStates() * num_sequences_, num_sequences_).SetZero(); beta_smbr_.ColRange(den_graph_.NumStates() * num_sequences_, @@ -70,6 +71,22 @@ DenominatorSmbrComputation::DenominatorSmbrComputation( } +void DenominatorSmbrComputation::AlphaFirstFrame() { + // dim == num_hmm_states_ * num_sequences_. + BaseFloat *first_frame_alpha = alpha_.RowData(0); + // create a 'fake matrix' - view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix alpha_mat(first_frame_alpha, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + // TODO (possible): It would be more efficient here if we implemented a + // CopyColsFromVec function in class CuMatrix. + alpha_mat.SetZero(); + alpha_mat.AddVecToCols(1.0, den_graph_.InitialProbs(), 0.0); +} + + void DenominatorSmbrComputation::AlphaSmbrFirstFrame() { // dim == num_hmm_states_ * num_sequences_. BaseFloat *first_frame_alpha_smbr = alpha_smbr_.RowData(0); @@ -79,19 +96,17 @@ void DenominatorSmbrComputation::AlphaSmbrFirstFrame() { den_graph_.NumStates(), num_sequences_, num_sequences_); - // TODO (possible): It would be more efficient here if we implemented a - // CopyColsFromVec function in class CuMatrix. alpha_smbr_mat.SetZero(); } -// the alpha computation for some 0 < t <= num_time_steps_. +// the alpha smbr computation for some 0 < t <= num_time_steps_. void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { KALDI_ASSERT(t > 0 && t <= frames_per_sequence_); BaseFloat *this_alpha = alpha_.RowData(t); BaseFloat *this_alpha_smbr = alpha_smbr_.RowData(t); const BaseFloat *prev_alpha_dash = alpha_.RowData(t - 1); - const BaseFloat *prev_prev_alpha_dash = (t > 1 ? alpha_.RowData(t - 2) : NULL); + const BaseFloat *prev_alpha_smbr = alpha_smbr_.RowData(t - 1); const Int32Pair *backward_transitions = den_graph_.BackwardTransitions(); const DenominatorGraphTransition *transitions = den_graph_.Transitions(); int32 num_pdfs = exp_nnet_output_transposed_.NumRows(), @@ -105,22 +120,51 @@ void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - Timer tim; + CuTimer tim; dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); - cuda_chain_hmm_forward(dimGrid, dimBlock, backward_transitions, transitions, - num_sequences, prob_data, probs.Stride(), - prev_alpha_dash, this_alpha); - - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + while (1) { + if (dimGrid.y > 65535) // the hardware doesn't allow more than this. + dimGrid.y = 65535; + cuda_chain_smbr_hmm_forward(dimGrid, dimBlock, + backward_transitions, transitions, + num_sequences, den_graph_.NumStates(), + prob_data, probs.Stride(), + num_posteriors_.Row(t).Data(), + prev_alpha_dash, prev_alpha_smbr, + this_alpha, this_alpha_smbr); + CU_SAFE_CALL(cudaGetLastError()); + if (dimGrid.y == num_hmm_states) { + break; // this is the normal case. + } else { + // We reach this code only in the unusual case where num_hmm_states > + // 65535. We can compute the alphas for the remaining HMM states by + // moving some of the array pointers and making the call again. 
+ backward_transitions += dimGrid.y; + this_alpha += dimGrid.y * num_sequences; + this_alpha_smbr += dimGrid.y * num_sequences; + num_hmm_states -= dimGrid.y; + dimGrid.y = num_hmm_states; + } + } + CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { int32 prob_stride = probs.Stride(); for (int32 h = 0; h < num_hmm_states; h++) { for (int32 s = 0; s < num_sequences; s++) { + // Let arbitrary_scale be the inverse of the alpha-sum value that we + // store in the same place we'd store the alpha for the state numbered + // 'num_hmm_states'. We multiply this into all the + // transition-probabilities from the previous frame to this frame, in + // both the forward and backward passes, in order to keep the alphas in + // a good numeric range. This won't affect the posteriors, but when + // computing the total likelihood we'll need to compensate for it later + // on. + BaseFloat arbitrary_scale = + 1.0 / prev_alpha_dash[num_hmm_states * num_sequences + s]; double this_tot_alpha = 0.0; double this_tot_alpha_smbr = 0.0; const DenominatorGraphTransition @@ -129,34 +173,72 @@ void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { for (; trans_iter != trans_end; ++trans_iter) { BaseFloat transition_prob = trans_iter->transition_prob; int32 pdf_id = trans_iter->pdf_id, - prev_hmm_state = trans_iter->hmm_state; + prev_hmm_state = trans_iter->hmm_state; BaseFloat prob = prob_data[pdf_id * prob_stride + s], this_prev_alpha = prev_alpha_dash[prev_hmm_state * num_sequences + s], - this_prev_alpha_smbr = prev_alpha_dash_smbr[prev_hmm_state * num_sequences + s], - + this_prev_alpha_smbr = prev_alpha_smbr[prev_hmm_state * num_sequences + s]; this_tot_alpha += this_prev_alpha * transition_prob * prob; - this_tot_alpha_smbr += this_prev_alpha_smbr * transition_prob * prob - + (pdf_id == ref_pdf_id ? this_prev_alpha * prev_prev_alpha_dash[num_hmm_states * num_sequences + s] * transition_prob * prob : 0.0); + KALDI_ASSERT(num_posteriors_(t, pdf_id) > -1e-20); + this_tot_alpha_smbr += + this_prev_alpha_smbr * this_prev_alpha / arbitrary_scale + + transition_prob * prob * num_posteriors_(t, pdf_id); } - // Let arbitrary_scale be the inverse of the alpha-sum value that we - // store in the same place we'd store the alpha for the state numbered - // 'num_hmm_states'. We multiply this into all the - // transition-probabilities from the previous frame to this frame, in - // both the forward and backward passes, in order to keep the alphas in - // a good numeric range. This won't affect the posteriors, but when - // computing the total likelihood we'll need to compensate for it later - // on. - BaseFloat arbitrary_scale = - 1.0 / prev_alpha_dash[num_hmm_states * num_sequences + s]; KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); - KALDI_ASSERT(this_tot_alpha_smbr - this_tot_alpha_smbr == 0); this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; + this_alpha_smbr[h * num_sequences + s] = this_tot_alpha_smbr / this_tot_alpha; } } } } + +void DenominatorSmbrComputation::AlphaDash(int32 t) { + BaseFloat *this_alpha = alpha_.RowData(t); + + // create a 'fake matrix' for the regular alphas- view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix alpha_mat(this_alpha, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + + // the alpha-dash is the sum of alpha over all states. 
+ CuSubVector alpha_sum_vec(this_alpha + + den_graph_.NumStates() * num_sequences_, + num_sequences_); + alpha_sum_vec.AddRowSumMat(1.0, alpha_mat, 0.0); + + alpha_mat.AddVecVec(opts_.leaky_hmm_coefficient, + den_graph_.InitialProbs(), + alpha_sum_vec); + // it's now alpha-dash. +} + +// compute beta from beta-dash. +void DenominatorSmbrComputation::Beta(int32 t) { + BaseFloat *this_beta_dash = beta_.RowData(t % 2); + // create a 'fake matrix' for the regular beta-dash (which is + // the counterpart of alpha-dash)- view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix beta_dash_mat(this_beta_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + // making the t index implicit, the beta-dash-sum for each sequence is the sum + // over all states i of beta_i * opts_.leaky_hmm_coefficient * initial_prob_i. + CuSubVector beta_dash_sum_vec( + this_beta_dash + den_graph_.NumStates() * num_sequences_, + num_sequences_); + beta_dash_sum_vec.AddMatVec(opts_.leaky_hmm_coefficient, beta_dash_mat, + kTrans, den_graph_.InitialProbs(), 0.0); + // we are computing beta in place. After the following, beta-dash-mat + // will contain the actual beta (i.e. the counterpart of alpha), + // not the beta-dash. + beta_dash_mat.AddVecToRows(1.0, beta_dash_sum_vec); +} + BaseFloat DenominatorSmbrComputation::ForwardSmbr() { + AlphaFirstFrame(); AlphaSmbrFirstFrame(); AlphaDash(0); for (int32 t = 1; t <= frames_per_sequence_; t++) { @@ -168,44 +250,26 @@ BaseFloat DenominatorSmbrComputation::ForwardSmbr() { BaseFloat DenominatorSmbrComputation::ComputeTotObjf() { tot_prob_.Resize(num_sequences_); + tot_smbr_.Resize(num_sequences_); // View the last alpha-dash as a matrix of size num-hmm-states by num-sequences. CuSubMatrix last_alpha_dash( alpha_.RowData(frames_per_sequence_), den_graph_.NumStates(), num_sequences_, num_sequences_); - CuMatrix last_alpha_smbr_dash( + CuSubMatrix last_alpha_smbr( alpha_smbr_.RowData(frames_per_sequence_), den_graph_.NumStates(), num_sequences_, num_sequences_); + // TODO: Make this vector multiplication tot_prob_.AddRowSumMat(1.0, last_alpha_dash, 0.0); - // we should probably add an ApplyLog() function that takes a vector argument. - tot_log_prob_ = tot_prob_; - tot_log_prob_.ApplyLog(); - BaseFloat tot_log_prob = tot_log_prob_.Sum(); - - // We now have to add something for the arbitrary scaling factor. [note: the - // purpose of the arbitrary scaling factors was to keep things in a good - // floating-point range] - // The inverses of all the tot-alpha quantities, for t = 0 - // ... frames_per_sequence_ - 1, were included as the 'arbitrary factors' in - // the transition-probs, so we need to multiply them all together (not - // inversed) and add them as a correction term to the total log-likes. - // These tot-alpha quantities were stored in the same place that we would - // have stored the HMM-state numbered 'num_hmm_states'. 
- int32 num_hmm_states = den_graph_.NumStates(); - CuSubMatrix inv_arbitrary_scales( - alpha_, 0, frames_per_sequence_, - num_sequences_ * num_hmm_states, num_sequences_); - CuMatrix log_inv_arbitrary_scales( - inv_arbitrary_scales); - log_inv_arbitrary_scales.ApplyLog(); - BaseFloat log_inv_arbitrary_scales_product = - log_inv_arbitrary_scales.Sum(); - BaseFloat tot_objf = last_alpha_smbr_dash.Sum(); - return tot_objf; + last_alpha_smbr.MulElements(last_alpha_dash); + tot_smbr_.AddRowSumMat(1.0, last_alpha_smbr, 0.0); + tot_smbr_.DivElements(tot_prob_); + + return tot_smbr_.Sum(); } @@ -213,10 +277,10 @@ BaseFloat DenominatorSmbrComputation::ComputeTotObjf() { bool DenominatorSmbrComputation::BackwardSmbr( BaseFloat deriv_weight, CuMatrixBase *nnet_output_deriv) { - BetaSmbrDashLastFrame(); - BetaSmbr(frames_per_sequence_); + BetaDashLastFrame(); + Beta(frames_per_sequence_); for (int32 t = frames_per_sequence_ - 1; t >= 0; t--) { - BetaSmbrDashGeneralFrame(t); + BetaSmbrGeneralFrame(t); if (GetVerboseLevel() >= 1 || t == 0) BetaSmbrGeneralFrameDebug(t); Beta(t); @@ -242,14 +306,13 @@ bool DenominatorSmbrComputation::BackwardSmbr( return ok_; } -void DenominatorSmbrComputation::BetaSmbrDashLastFrame() { +void DenominatorSmbrComputation::BetaDashLastFrame() { // sets up the beta-dash quantity on the last frame (frame == // frames_per_sequence_). Note that the betas we use here contain a // 1/(tot-prob) factor in order to simplify the backprop. int32 t = frames_per_sequence_; BaseFloat *last_frame_beta_dash = beta_.RowData(t % 2); - BaseFloat *last_frame_beta_smbr_dash = beta_smbr_.RowData(t % 2); // create a 'fake matrix' - view this row as a matrix. CuSubMatrix beta_dash_mat(last_frame_beta_dash, @@ -261,14 +324,18 @@ void DenominatorSmbrComputation::BetaSmbrDashLastFrame() { // the beta values at the end of the file only vary with the sequence-index, // not with the HMM-index. We treat all states as having a final-prob of one. beta_dash_mat.CopyRowsFromVec(inv_tot_prob); - CuSubMatrix beta_smbr_dash_mat(last_frame_beta_smbr_dash, - den_graph_.NumStates(), - num_sequences_, - num_sequences_); - beta_smbr_dash_mat.SetZero(); } -void DenominatorSmbrComputation::BetaSmbrDashGeneralFrame(int32 t) { +void DenominatorSmbrComputation::BetaSmbrLastFrame() { + // sets up the beta-dash quantity on the last frame (frame == + // frames_per_sequence_). Note that the betas we use here contain a + // 1/(tot-prob) factor in order to simplify the backprop. + + int32 t = frames_per_sequence_; + beta_smbr_.Row(t % 2).SetZero(); +} + +void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { KALDI_ASSERT(t >= 0 && t < frames_per_sequence_); int32 num_pdfs = exp_nnet_output_transposed_.NumRows(); // t_wrapped gives us the time-index we use when indexing @@ -277,11 +344,11 @@ void DenominatorSmbrComputation::BetaSmbrDashGeneralFrame(int32 t) { // non-transposed output whenever we finish a chunk. 
int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps); const BaseFloat *this_alpha_dash = alpha_.RowData(t), - *this_alpha_smbr_dash = alpha_smbr_.RowData(t), + *this_alpha_smbr = alpha_smbr_.RowData(t), *next_beta = beta_.RowData((t + 1) % 2), - *next_betas_smbr = beta_smbr.RowData((t + 1) % 2); - BaseFloat *this_beta_dash = beta_.RowData(t % 2); - BaseFloat *this_beta_smbr_dash = beta_smbr_.RowData(t % 2); + *next_beta_smbr = beta_smbr_.RowData((t + 1) % 2); + BaseFloat *this_beta_dash = beta_.RowData(t % 2), + *this_beta_smbr = beta_smbr_.RowData(t % 2); const Int32Pair *forward_transitions = den_graph_.ForwardTransitions(); const DenominatorGraphTransition *transitions = den_graph_.Transitions(); // 'probs' is the matrix of pseudo-likelihoods for frame t. @@ -295,15 +362,36 @@ void DenominatorSmbrComputation::BetaSmbrDashGeneralFrame(int32 t) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - Timer tim; + CuTimer tim; dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); - cuda_chain_hmm_backward(dimGrid, dimBlock, forward_transitions, transitions, - num_sequences, probs.Data(), probs.Stride(), - this_alpha_dash, next_beta, this_beta_dash, - log_prob_deriv.Data(), log_prob_deriv.Stride()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + while (1) { + if (dimGrid.y > 65535) // the hardware doesn't allow more than this. + dimGrid.y = 65535; + cuda_chain_smbr_hmm_backward( + dimGrid, dimBlock, forward_transitions, transitions, + num_sequences, num_hmm_states, + probs.Data(), probs.Stride(), + this_alpha_dash, this_alpha_smbr, + next_beta, next_beta_smbr, + this_beta_dash, this_beta_smbr, + log_prob_deriv.Data(), log_prob_deriv.Stride()); + CU_SAFE_CALL(cudaGetLastError()); + if (dimGrid.y == num_hmm_states) { + break; // this is the normal case. + } else { + // We reach this code only in the unusual case where num_hmm_states > + // 65535. We can compute the betas (and log-prob derivatives) for the + // remaining HMM states by moving some of the array pointers and making + // the call again. 
+ forward_transitions += dimGrid.y; + this_alpha_dash += dimGrid.y * num_sequences; + this_beta_dash += dimGrid.y * num_sequences; + num_hmm_states -= dimGrid.y; + dimGrid.y = num_hmm_states; + } + } + CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { @@ -314,10 +402,10 @@ void DenominatorSmbrComputation::BetaSmbrDashGeneralFrame(int32 t) { for (int32 h = 0; h < num_hmm_states; h++) { for (int32 s = 0; s < num_sequences; s++) { BaseFloat this_alpha_dash_prob = this_alpha_dash[h * num_sequences + s], - this_alpha_smbr_dash_prob = this_alpha_smbr_dash[h * num_sequences + s], + this_alpha_smbr_i = this_alpha_smbr[h * num_sequences + s], inv_arbitrary_scale = this_alpha_dash[num_hmm_states * num_sequences + s]; - double tot_variable_factor = 0.0, tot_smbr_variable_factor = 0.0; + double tot_variable_factor = 0.0, beta_smbr = 0.0; BaseFloat occupation_factor = this_alpha_dash_prob / inv_arbitrary_scale; const DenominatorGraphTransition @@ -327,30 +415,35 @@ void DenominatorSmbrComputation::BetaSmbrDashGeneralFrame(int32 t) { BaseFloat transition_prob = trans_iter->transition_prob; int32 pdf_id = trans_iter->pdf_id, next_hmm_state = trans_iter->hmm_state; - BaseFloat variable_factor = transition_prob * - next_beta[next_hmm_state * num_sequences + s] * + BaseFloat next_beta_j = next_beta[next_hmm_state + num_sequences + s], + next_beta_smbr_j = next_beta_smbr[next_hmm_state + num_sequences + s]; + BaseFloat variable_factor = transition_prob * next_beta_j * prob_data[pdf_id * prob_stride + s]; - BaseFloat smbr_variable_factor = (pdf_id == ref_pdf_id ? variable_factor : 0.0) - + transition_prob * prob_data[pdf_id * prob_stride + s] * next_beta_smbr[next_hmm_state * num_sequences + s]; + beta_smbr += next_beta_smbr_j * next_beta_j + + prob_data[pdf_id * prob_stride + s] / inv_arbitrary_scale + * transition_prob * num_posteriors_(t, pdf_id); tot_variable_factor += variable_factor; - BaseFloat occupation_prob = variable_factor * occupation_factor; - gamma_smbr = smbr_varaible_factor * occupation_factor; - log_prob_deriv_data[pdf_id * deriv_stride + s] += occupation_prob; + double this_gamma_r = occupation_factor * next_beta_j + * transition_prob * (this_alpha_smbr_i + num_posteriors_(t, pdf_id) + + next_beta_smbr_j - tot_smbr_(s)); + log_prob_deriv_data[pdf_id * deriv_stride + s] += this_gamma_r; } this_beta_dash[h * num_sequences + s] = tot_variable_factor / inv_arbitrary_scale; - this_beta_dash_smbr[h * num_sequences + s] = - tot_smbr_variable_factor / inv_arbitrary_scale; + this_beta_smbr[h * num_sequences + s] = beta_smbr / this_beta_dash[h * num_sequences + s]; + } } } } -void DenominatorSmbrComputation::BetaGeneralFrameDebug(int32 t) { +void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { BaseFloat num_hmm_states = den_graph_.NumStates(), alpha_beta_size = num_hmm_states * num_sequences_; CuSubVector this_alpha_dash(alpha_.RowData(t), alpha_beta_size), - this_beta_dash(beta_.RowData(t % 2), alpha_beta_size); + this_beta_dash(beta_.RowData(t % 2), alpha_beta_size), + this_alpha_smbr(alpha_smbr_.RowData(t), alpha_beta_size), + this_beta_smbr(beta_smbr_.RowData(t % 2), alpha_beta_size); int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps), num_pdfs = exp_nnet_output_transposed_.NumRows(); CuSubMatrix this_log_prob_deriv( @@ -385,4 +478,3 @@ void DenominatorSmbrComputation::BetaGeneralFrameDebug(int32 t) { } // namespace chain } // namespace kaldi - diff --git a/src/chain/chain-denominator-smbr.h b/src/chain/chain-denominator-smbr.h index c34a4d1a4eb..663f9387d23 
100644 --- a/src/chain/chain-denominator-smbr.h +++ b/src/chain/chain-denominator-smbr.h @@ -23,11 +23,25 @@ #ifndef KALDI_CHAIN_CHAIN_DENOMINATOR_SMBR_H_ #define KALDI_CHAIN_CHAIN_DENOMINATOR_SMBR_H_ -#include "chain/chain-denominator.h" +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "tree/context-dep.h" +#include "lat/kaldi-lattice.h" +#include "matrix/kaldi-matrix.h" +#include "hmm/transition-model.h" +#include "cudamatrix/cu-matrix.h" +#include "cudamatrix/cu-array.h" +#include "chain/chain-den-graph.h" +#include "chain/chain-training.h" namespace kaldi { namespace chain { + /* This extended comment describes how we implement forward-backward without log and without overflow, and also the leaky-HMM idea. @@ -95,8 +109,10 @@ namespace chain { beta_r(t, i) = 0 for (j, p, n) in foll(i): # note: j is following-state. beta(t, i) += x(t, n) * beta(t+1, j) * p. - beta_r(t, i) += (ref_pdf == pdf ? x(t, n) * beta(t+1, j) * p : 0) + x(t, n) * p * beta_r(t+1, j). + beta_r(t, i) += beta(t+1, j) * beta_r(t+1, j) + x(t, n) * p * (ref_pdf == pdf ? 1.0 : 0) gamma(t, n) += alpha(t, i) * x(t, n) * beta(t+1, j) * p. + gamma_r(t, n) += alpha(t, i) * x(t, n) * beta(t+1, j) * p * (alpha_r(t, i) + (ref_pdf == pdf ? 1.0 : 0) + beta_r(t+1, j) - tot_objf) + beta_r(t, i) /= beta(t, i) ** Version 2 of the computation (renormalized version) ** @@ -149,11 +165,16 @@ namespace chain { - For 1 <= t <= T, the computation of alpha(t, i) is as before except we use the previous frame's alpha' instead of alpha. That is: alpha(t, i) = 0 + alpha_r(t, i) = 0 for (j, p, n) in pred(i): # note: j is preceding-state. alpha(t, i) += alpha'(t-1, j) * p * x(t-1, n) / tot-alpha(t-1) + alpha_r(t, i) += alpha_r(t-1, j) * alpha'(t-1, j) + x(t-1, n) / tot-alpha(t-1) * p * (ref_pdf == pdf ? 1.0 : 0.0) + alpha_r(t, i) /= alpha(t,i) - total-prob = \sum_i alpha'(T, i) + - total-objf = \sum_i alpha'(T, i) * alpha_r(T, i) / total-prob + The corrected log-prob that we return from the algorithm will be (total-prob + \sum_{t=0}^{T-1} tot-alpha(t)). @@ -167,6 +188,7 @@ namespace chain { derivative w.r.t. tot-alpha. - beta'(T, i) = 1 / total-prob. + - beta_r(T, i) = 0 - for 0 <= t <= T, define tot-beta(t) = leaky-hmm-prob * \sum_i init(i) * beta'(t, i) - for 0 <= t <= T, define beta(t, i) = beta'(t, i) + tot-beta(t). - for 0 <= t < T, we compute beta'(t, i) and update gamma(t, n) as follows: @@ -174,14 +196,17 @@ namespace chain { beta'(t, i) = 0 for (j, p, n) in foll(i): # note: j is following-state. beta'(t, i) += beta(t+1, j) * p * x(t, n) / tot-alpha(t) + beta_r(t, i) += beta(t+1, j) * beta_r(t+1, j) + x(t, n) / tot-alpha(t) * p * (ref_pdf == pdf ? 1.0 : 0) gamma(t, n) += alpha'(t, i) * beta(t+1, j) * p * x(t, n) / tot-alpha(t) + gamma_r(t, n) += alpha'(t, i) * x(t, n) / tot-alpha(t) * beta(t+1, j) * p * (alpha_r(t, i) + (ref_pdf == pdf ? 1.0 : 0.0) + beta_r(t+1, j) - tot_objf) + beta_r(t, i) /= beta(t, i) Note: in the code, the tot-alpha and tot-beta quantities go in the same memory location that the corresponding alpha and beta for state I would go. */ -class DenominatorSmbrComputation : DenominatorComputation { +class DenominatorSmbrComputation { public: /* Constructor. 
'nnet_output' is the raw nnet output (which we'll treat as @@ -199,14 +224,15 @@ class DenominatorSmbrComputation : DenominatorComputation { DenominatorSmbrComputation(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, int32 num_sequences, - const CuMatrixBase &nnet_output); + const CuMatrixBase &nnet_output, + const CuMatrixBase &num_posteriors); - // Does the forward computation, and returns the total negated log-like summed + // Does the forward computation, and returns the total objective summed // over all sequences. You will have to scale this by any supervision // weighting factor, manually. BaseFloat ForwardSmbr(); - // this adds deriv_weight times (the derivative of the log-prob w.r.t. the + // this adds deriv_weight times (the derivative of the objective w.r.t. the // nnet output), to 'nnet_output_deriv'. // returns true if everything seemed OK, false if a failure was detected. bool BackwardSmbr(BaseFloat deriv_weight, @@ -219,37 +245,89 @@ class DenominatorSmbrComputation : DenominatorComputation { // setting it small is that we have to invoke an AddMat kernel more times. enum { kMaxDerivTimeSteps = 8 }; + // sets up the alpha for frame t = 0. + void AlphaFirstFrame(); // sets up the alpha for frame t = 0. void AlphaSmbrFirstFrame(); // the alpha computation for some 0 < t <= num_time_steps_. void AlphaSmbrGeneralFrame(int32 t); // does the 'alpha-dash' computation for time t. this relates to // 'leaky hmm'. - void AlphaSmbrDash(int32 t); + void AlphaDash(int32 t); // done after all the alphas, this function computes and returns the total // smbr objective summed over all the sequences, and sets tot_prob_ (if we're // doing correction) log_correction_term_. Note, this won't be scaled by // 'deriv_scale' (which of course we haven't seen by the time this is called, - // from the Forward() computation). + // from the ForwardSmbr() computation). BaseFloat ComputeTotObjf(); - - void BetaSmbrDashLastFrame(); + void BetaDashLastFrame(); + void BetaSmbrLastFrame(); // beta computation for 0 <= beta < num_time_steps_. - void BetaSmbrDashGeneralFrame(int32 t); + void BetaSmbrGeneralFrame(int32 t); // compute the beta quantity from the beta-dash quantity (relates to leaky hmm). - void BetaSmbr(int32 t); + void Beta(int32 t); // some checking that we can do if debug mode is activated, or on frame zero. // Sets ok_ to false if a bad problem is detected. void BetaSmbrGeneralFrameDebug(int32 t); + const ChainTrainingOptions &opts_; + const DenominatorGraph &den_graph_; + + // number of separate frame sequences + int32 num_sequences_; + // number of frames per sequence. nnet_output_.NumRows() equals + // num_sequences_ * frames_per_sequence. + int32 frames_per_sequence_; + + // The transpose of the exp() of the nnet output (the transpose is more + // convenient for memory locality, and the exp() avoids us having to + // exponentiate in the forward-backward). + // + // The row-index is the pdf-id; and the column index equals (frame_index * + // num_sequences + sequence_index). + CuMatrix exp_nnet_output_transposed_; + + // the numberator posterior probabilities + // This is a matrix of size num_sequences x num_pdfs + CuMatrix num_posteriors_; + + // the derivs w.r.t. the nnet outputs (transposed) + CuMatrix nnet_output_deriv_transposed_; + + // the (temporarily) alpha and (more permanently) alpha-dash probabilities; + // dimension is (frames_per_sequence + 1) by (num-hmm-states * num-sequences + + // num_sequences). Note, they are not logs. 
The last 'num_sequences' + // columns, where the alpha for the state indexed 'num_hmm_states' would live, + // are for the alpha-sums, which relates to leaky HMM. + CuMatrix alpha_; + + // the analogous alpha quantities for the SMBR objective CuMatrix alpha_smbr_; + // the beta (also beta-dash) probabilities (rolling buffer); dimension is 2 * + // (num-hmm-states * num-sequences + num_sequences). [the last + // 'num_sequences' columns are for the beta-sums, which relates to leaky HMM.] + // Note: for efficiency and to simplify the equations, these are actually the + // beta / tot_prob_. + CuMatrix beta_; + + // the analogous beta quantities for the SMBR objective CuMatrix beta_smbr_; + // the total probability for each sequence, excluding the product of + // correction terms. [the correction terms refer to the fact that we multiply + // on each frame by 1/alpha of hmm-state 0 of the previous frame.]. + // After the correction terms the total probability is fairly close to 1, + // which is why we can store it as non-log. + CuVector tot_prob_; + + // the total smbr for each sequence. CuVector tot_smbr_; + + bool ok_; }; } // namespace chain diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h index 4b5db013752..a4a417c8a5d 100644 --- a/src/chain/chain-denominator.h +++ b/src/chain/chain-denominator.h @@ -305,8 +305,6 @@ class DenominatorComputation { CuVector log_correction_term_; bool ok_; - - friend DenominatorSmbrComputation; }; diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index 388c78ab2ee..eba731a034a 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -48,6 +48,36 @@ extern "C" { const BaseFloat *prev_alpha, BaseFloat *this_alpha); + void cuda_chain_smbr_hmm_backward(dim3 Gr, dim3 Bl, + const Int32Pair *forward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, + int32_cuda prob_stride, + const BaseFloat *num_post, + const BaseFloat *this_alpha, + const BaseFloat *this_alpha_smbr, + const BaseFloat *next_beta, + const BaseFloat *next_beta_smbr, + BaseFloat *this_beta, + BaseFloat *this_beta_smbr, + BaseFloat *log_prob_deriv, + int32_cuda log_prob_deriv_stride); + + void cuda_chain_smbr_hmm_forward(dim3 Gr, dim3 Bl, + const Int32Pair *backward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, + int32_cuda prob_stride, + const BaseFloat *num_post, + const BaseFloat *prev_alpha, + const BaseFloat *prev_alpha_smbr, + BaseFloat *this_alpha, + BaseFloat *this_alpha_smbr); + } // extern "C" #endif // HAVE_CUDA diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index f093f21a5a5..3acceaa2bae 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -256,6 +256,212 @@ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, } +// one iteration of the forward computation in the chain HMM with +// SMBR objective. +// The grid y determines which HMM-state we handle. [put this in the grid because +// HMM-states don't all take the same amount of time in the backwards direction, and it's +// better for scheduling to have them at the outer level.] +// The block x and grid x determine which sequence (0 ... num_sequences - 1) we handle; +// note that num_sequences == the number of elements in the minibatch, and we +// insist they all have the same number of time steps. 
+// note: 'probs' is indexed by sequence-index + (pdf-index * prob_stride). +__global__ +static void _cuda_chain_smbr_hmm_forward( + const Int32Pair *backward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *num_post, + const BaseFloat *prev_alpha, const BaseFloat *prev_alpha_smbr, + BaseFloat *this_alpha, BaseFloat *this_alpha_smbr) { + // 'backward_transitions', indexed by hmm-state, consists of [start, end] + // indexes into the 'transitions' array. This gives us the info for + // transitions *into* this state. 'probs' contains the exponentiated neural + // net outputs; it has dimension num-output-indexes by num_sequences and its + // stride is 'prob_stride'. 'prev_alpha' and 'this_alpha', which are + // extracted from a larger matrix, both have dimension num-history-states by + // num-sequences. 'prev_alpha_smbr' and 'this_alpha_smbr' are analogous + // for the partial SMBR values. + + // s is the index of the sequence within the minibatch, + // from 0 .. num-egs-in-this-minibatch - 1. + // h is the hmm-state index. + int32_cuda s = threadIdx.x + blockIdx.x * blockDim.x, + h = blockIdx.y; + if (s >= num_sequences) + return; + + // Let arbitrary_scale be the inverse of the sum of all alpha values on-- the + // previous frame this sum of all the alpha values is stored in the place that + // we'd store the previous alpha for state-index equal to num_hmm_states + // (i.e. one past the end). We multiply this into all the + // transition-probabilities from the previous frame to this frame, in both the + // forward and backward passes, in order to keep the alphas in a good numeric + // range. This won't affect the posteriors, as it's just a constant factor + // for each frame, but when computing the total likelihood we'll need to + // compensate for it later on. + BaseFloat arbitrary_scale = + 1.0 / prev_alpha[num_hmm_states * num_sequences + s]; + + double this_tot_alpha = 0.0, this_tot_alpha_smbr; + const DenominatorGraphTransition + *trans_iter = transitions + backward_transitions[h].first, + *trans_end = transitions + backward_transitions[h].second; + // Note: regarding this loop unrolling, I tried the automatic unrolling using + // #pragma unroll 2 (after modifying the loop to have an integer index), but I + // did not see any performance improvement, it was slightly slower. So the + // compiler must be doing something different than what I'm doing here. + const int loop_unroll = 2; // don't change this without changing the code + // below. 
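+  // Descriptive note (added for clarity, not part of the original patch):
+  // each unrolled iteration below handles one incoming transition
+  // (previous HMM-state j, transition-prob p, pdf n).  It adds
+  // alpha'(t-1, j) * p * x(t-1, n) to the plain forward sum for this state,
+  // and the corresponding contribution, weighted by the numerator posterior
+  // num_post[n], to the not-yet-normalized SMBR forward sum; the division by
+  // this_tot_alpha at the end of the kernel turns the latter into an
+  // expected-accuracy quantity.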
+ for (; trans_iter + loop_unroll <= trans_end; trans_iter += loop_unroll) { + BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + prev_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat transition_prob1 = trans_iter[1].transition_prob; + int32_cuda pdf_id1 = trans_iter[1].pdf_id, + prev_hmm_state1 = trans_iter[1].hmm_state; + BaseFloat pseudo_loglike0 = probs[pdf_id0 * prob_stride + s], + this_prev_alpha0 = prev_alpha[prev_hmm_state0 * num_sequences + s], + this_prev_alpha_smbr0 = + prev_alpha_smbr[prev_hmm_state0 * num_sequences + s], + pseudo_loglike1 = probs[pdf_id1 * prob_stride + s], + this_prev_alpha1 = prev_alpha[prev_hmm_state1 * num_sequences + s], + this_prev_alpha_smbr1 = prev_alpha[prev_hmm_state1 * num_sequences + s]; + + this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0 + + this_prev_alpha1 * transition_prob1 * pseudo_loglike1; + this_tot_alpha_smbr += + this_prev_alpha_smbr0 * this_prev_alpha0 / arbitrary_scale + + transition_prob0 * pseudo_loglike0 * num_post[pdf_id0] + + this_prev_alpha_smbr0 * this_prev_alpha1 / arbitrary_scale + + transition_prob1 * pseudo_loglike1 * num_post[pdf_id1]; + } + if (trans_iter != trans_end) { + // mop up the odd transition. + BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + prev_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat pseudo_loglike0 = probs[pdf_id0 * prob_stride + s], + this_prev_alpha0 = prev_alpha[prev_hmm_state0 * num_sequences + s], + this_prev_alpha_smbr0 = + prev_alpha_smbr[prev_hmm_state0 * num_sequences + s]; + this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0; + this_tot_alpha_smbr += + this_prev_alpha_smbr0 * this_prev_alpha0 / arbitrary_scale + + transition_prob0 * pseudo_loglike0 * num_post[pdf_id0]; + } + + this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; + this_alpha_smbr[h * num_sequences + s] = + this_tot_alpha_smbr / this_tot_alpha; +} + + +__global__ +static void _cuda_chain_smbr_hmm_backward( + const Int32Pair *forward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, int32_cuda num_hmm_states, + const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *num_post, + const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, + const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, + BaseFloat *this_beta, BaseFloat *this_beta_smbr, + BaseFloat *log_prob_deriv, int32_cuda log_prob_deriv_stride) { + // 'forward_transitions', indexed by hmm-state, consists of [start, end] + // indexes into the 'transition_info' array. This is about the transitions + // *out of* this state. 'probs' contains the exponentiated neural net + // outputs; it has dimension num-output-indexes by num_sequences, and contains + // just the observation probabilities for this time index. Its stride is + // prob_stride. + // 'this_alpha', 'next_beta' and 'this_beta' all have dimension + // num-history-states by num-sequences. + // 'this_alpha_smbr', 'next_beta_smbr', and 'this_beta_smbr' are + // analogous quantities storing values for SMBR objective. + // The beta probs are normalized in such a way (by multiplying by 1/(total-data-prob)) + // that to get occupation counts we don't need to multiply by 1/total-data-prob. + // deriv_scale is a factor (e.g. -1.0 or -0.99) that we multiply these derivs by + // while accumulating them. + + // s is the index of the sequence within the minibatch, + // from 0 .. 
num-egs-in-this-minibatch - 1. + // h is the hmm-state index. + int32_cuda s = threadIdx.x + blockIdx.x * blockDim.x, + h = blockIdx.y; + if (s >= num_sequences) + return; + + // See where arbitrary_scale is defined in the forward computation above, for + // more explanation of inv_arbitrary_scale. + BaseFloat this_alpha_prob = this_alpha[h * num_sequences + s], + this_alpha_smbr_i = this_alpha_smbr[h * num_sequences + s], + inv_arbitrary_scale = + this_alpha[num_hmm_states * num_sequences + s]; + double tot_variable_factor = 0.0, beta_smbr = 0.0; + + BaseFloat occupation_factor = this_alpha_prob / inv_arbitrary_scale; + const DenominatorGraphTransition + *trans_iter = transitions + forward_transitions[h].first, + *trans_end = transitions + forward_transitions[h].second; + const int loop_unroll = 2; // don't change this without changing the code + // below. + for (; trans_iter + loop_unroll <= trans_end; trans_iter += loop_unroll) { + BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + next_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat transition_prob1 = trans_iter[1].transition_prob; + int32_cuda pdf_id1 = trans_iter[1].pdf_id, + next_hmm_state1 = trans_iter[1].hmm_state; + BaseFloat next_beta_j0 = next_beta[next_hmm_state0 * num_sequences + s], + next_beta_smbr_j0 = next_beta_smbr[next_hmm_state0 * num_sequences + s], + next_beta_j1 = next_beta[next_hmm_state1 * num_sequences + s], + next_beta_smbr_j1 = next_beta_smbr[next_hmm_state1 * num_sequences + s], + prob0 = probs[pdf_id0 * prob_stride + s], + prob1 = probs[pdf_id1 * prob_stride + s], + num_post0 = num_post[pdf_id0], num_post1 = num_post[pdf_id1]; + + BaseFloat variable_factor0 = transition_prob0 * next_beta_j0 * prob0, + variable_factor1 = transition_prob1 * next_beta_j1 * prob1; + beta_smbr += next_beta_smbr_j0 * next_beta_j0 + + prob0 / inv_arbitrary_scale * transition_prob0 * num_post0 + + next_beta_smbr_j1 * next_beta_j1 + + prob1 / inv_arbitrary_scale * transition_prob1 * num_post1; + tot_variable_factor += variable_factor0 + variable_factor1; + BaseFloat this_gamma_r0 = occupation_factor * next_beta_j0 + * transition_prob0 * (this_alpha_smbr_i + num_post0 + + next_beta_smbr_j0 - tot_smbr[s]); + atomic_add_thresholded(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), + this_gamma_r0); + BaseFloat this_gamma_r1 = occupation_factor * next_beta_j1 + * transition_prob1 * (this_alpha_smbr_i + num_post1 + + next_beta_smbr_j1 - tot_smbr[s]); + atomic_add_thresholded(log_prob_deriv + (pdf_id1 * log_prob_deriv_stride + s), + this_gamma_r1); + } + if (trans_iter != trans_end) { + // mop up the odd transition. 
+ BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + next_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat next_beta_j0 = next_beta[next_hmm_state0 * num_sequences + s], + next_beta_smbr_j0 = next_beta_smbr[next_hmm_state0 * num_sequences + s], + prob0 = probs[pdf_id0 * prob_stride + s], + num_post0 = num_post[pdf_id0]; + BaseFloat variable_factor0 = transition_prob0 * next_beta_j0 * prob0; + beta_smbr += next_beta_smbr_j0 * next_beta_j0 + + prob0 / inv_arbitrary_scale * transition_prob0 * num_post0; + tot_variable_factor += variable_factor0; + BaseFloat this_gamma_r0 = occupation_factor * next_beta_j0 + * transition_prob0 * (this_alpha_smbr_i + num_post0 + + next_beta_smbr_j0 - tot_smbr[s]); + atomic_add_thresholded(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), + this_gamma_r0); + } + BaseFloat beta = tot_variable_factor / inv_arbitrary_scale; + this_beta[h * num_sequences + s] = beta; + this_beta_smbr = beta_smbr / beta; +} + + void cuda_chain_hmm_forward(dim3 Gr, dim3 Bl, const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, @@ -287,3 +493,43 @@ void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl, this_beta, log_prob_deriv, log_prob_deriv_stride); } + +// Chain forward with SMBR objective +void cuda_chain_smbr_hmm_forward( + dim3 Gr, dim3 Bl, + const Int32Pair *backward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, int32_cuda prob_stride, + const BaseFloat *num_post, + const BaseFloat *prev_alpha, const BaseFloat *prev_alpha_smbr, + BaseFloat *this_alpha, BaseFloat *this_alpha_smbr) { + _cuda_chain_smbr_hmm_forward<<>>( + backward_transitions, transitions, + num_sequences, num_hmm_states, + probs, prob_stride, num_post, + prev_alpha, prev_alpha_smbr, this_alpha, this_alpha_smbr); +} + +void cuda_chain_smbr_hmm_backward( + dim3 Gr, dim3 Bl, + const Int32Pair *forward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, int32_cuda prob_stride, + const BaseFloat *num_post, + const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, + const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, + BaseFloat *this_beta, BaseFloat *this_beta_smbr, + BaseFloat *log_prob_deriv, + int32_cuda log_prob_deriv_stride) { + _cuda_chain_smbr_hmm_backward<<>>( + forward_transitions, transitions, + num_sequences, num_hmm_states, + probs, prob_stride, num_post, + this_alpha, this_alpha_smbr, next_beta, next_beta_smbr, + this_beta, this_beta_smbr, log_prob_deriv, + log_prob_deriv_stride); +} diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 53de69a0e07..33be55ca94e 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -21,6 +21,7 @@ #include "chain/chain-kernels-ansi.h" #include "chain/chain-numerator.h" #include "chain/chain-denominator.h" +#include "chain/chain-denominator-smbr.h" namespace kaldi { namespace chain { @@ -110,6 +111,83 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, } } +void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const Supervision &supervision, + const CuMatrixBase &nnet_output, + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv) { + CuMatrix num_posteriors(nnet_output.NumRows(), + 
nnet_output.NumCols(), + kUndefined); + { + NumeratorComputation numerator(supervision, nnet_output); + // note: supervision.weight is included as a factor in the derivative from + // the numerator object, and the logprob too. + numerator.Forward(); + numerator.Backward(&num_posteriors); + if (xent_output_deriv) + xent_output_deriv->CopyFromMat(num_posteriors); + } + DenominatorSmbrComputation denominator(opts, den_graph, + supervision.num_sequences, + nnet_output, num_posteriors); + BaseFloat smbr_objf = denominator.ForwardSmbr(); + bool ok = true; + if (nnet_output_deriv) { + nnet_output_deriv->SetZero(); + ok = denominator.BackwardSmbr(supervision.weight, nnet_output_deriv); + } + + *objf = supervision.weight * smbr_objf; + *weight = supervision.weight * supervision.num_sequences * + supervision.frames_per_sequence; + if (!((*objf) - (*objf) == 0) || !ok) { + // inf or NaN detected, or denominator computation returned false. + if (nnet_output_deriv) + nnet_output_deriv->SetZero(); + if (xent_output_deriv) + xent_output_deriv->SetZero(); + BaseFloat default_objf = 0; + KALDI_WARN << "Objective function is " << (*objf) + << " and denominator computation (if done) returned " + << std::boolalpha << ok + << ", setting objective function to " << default_objf + << " per frame."; + *objf = default_objf * *weight; + } + + // This code helps us see how big the derivatives are, on average, + // for different frames of the sequences. As expected, they are + // smaller towards the edges of the sequences (due to the penalization + // of 'incorrect' pdf-ids. + if (GetVerboseLevel() >= 1 && nnet_output_deriv != NULL) { + int32 tot_frames = nnet_output_deriv->NumRows(), + frames_per_sequence = supervision.frames_per_sequence, + num_sequences = supervision.num_sequences; + CuVector row_products(tot_frames); + row_products.AddDiagMat2(1.0, *nnet_output_deriv, kNoTrans, 0.0); + Vector row_products_cpu(row_products); + Vector row_products_per_frame(frames_per_sequence); + for (int32 i = 0; i < tot_frames; i++) + row_products_per_frame(i / num_sequences) += row_products_cpu(i); + KALDI_LOG << "Derivs per frame are " << row_products_per_frame; + } + + if (opts.l2_regularize == 0.0) { + *l2_term = 0.0; + } else { + // compute the l2 penalty term and its derivative + BaseFloat scale = supervision.weight * opts.l2_regularize; + *l2_term = -0.5 * scale * TraceMatMat(nnet_output, nnet_output, kTrans); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(-1.0 * scale, nnet_output); + } +} + } // namespace chain } // namespace kaldi diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index e6143d10846..beca5c0b92f 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -61,8 +61,10 @@ struct ChainTrainingOptions { // should have a softmax as its final nonlinearity. 
BaseFloat xent_regularize; + bool use_smbr_objective; + ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), - xent_regularize(0.0) { } + xent_regularize(0.0), use_smbr_objective(false) { } void Register(OptionsItf *opts) { opts->Register("l2-regularize", &l2_regularize, "l2 regularization " @@ -78,6 +80,8 @@ struct ChainTrainingOptions { "nonzero, the network is expected to have an output " "named 'output-xent', which should have a softmax as " "its final nonlinearity."); + opts->Register("use-smbr-objective", &use_smbr_objective, + "Use SMBR objective instead of MMI"); } }; @@ -122,6 +126,46 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, CuMatrixBase *nnet_output_deriv, CuMatrixBase *xent_output_deriv = NULL); +/** + This function does both the numerator and denominator parts of the 'chain' + smbr computation in one call. + + @param [in] opts Struct containing options + @param [in] den_graph The denominator graph, derived from denominator fst. + @param [in] supervision The supervision object, containing the supervision + paths and constraints on the alignment as an FST + @param [in] nnet_output The output of the neural net; dimension must equal + ((supervision.num_sequences * supervision.frames_per_sequence) by + den_graph.NumPdfs()). The rows are ordered as: all sequences + for frame 0; all sequences for frame 1; etc. + @param [out] objf The smbr objective function computed for this + example; you'll want to divide it by 'tot_weight' before + displaying it. + @param [out] l2_term The l2 regularization term in the objective function, if + the --l2-regularize option is used. To be added to 'o + @param [out] weight The weight to normalize the objective function by; + equals supervision.weight * supervision.num_sequences * + supervision.frames_per_sequence. + @param [out] nnet_output_deriv The derivative of the objective function w.r.t. + the neural-net output. Only written to if non-NULL. + You don't have to zero this before passing to this function, + we zero it internally. + @param [out] xent_output_deriv If non-NULL, then the numerator part of the derivative + (which equals a posterior from the numerator forward-backward, + scaled by the supervision weight) is written to here. This will + be used in the cross-entropy regularization code. This value + is also used in computing the cross-entropy objective value. 
+*/ +void ComputeChainSmbrObjfAndDeriv( + const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const Supervision &supervision, + const CuMatrixBase &nnet_output, + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv = NULL); } // namespace chain diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index b2fadf6e9e5..b4b15798a4a 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -407,6 +407,14 @@ static void _vec_mul_elements(Real* v, const Real* a, int dim) { v[i] = v[i] * a[i]; } +template +__global__ +static void _vec_div_elements(Real* v, const Real* a, int dim) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < dim) + v[i] = v[i] / a[i]; +} + template __global__ static void _mul_cols_vec(Real* mat, const Real* scale, MatrixDim d) { @@ -3674,6 +3682,10 @@ void cudaF_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim) { _vec_mul_elements<<>>(v, a, dim); } +void cudaF_vec_div_elements(int Gr, int Bl, float* v, const float* a, int dim) { + _vec_div_elements<<>>(v, a, dim); +} + void cudaF_vec_min(int Gr, int Bl, const float* v, float* value, int dim, int inc) { _vec_transform_reduce<<>>(v, value, dim, inc, @@ -4331,6 +4343,11 @@ void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a, _vec_mul_elements<<>>(v, a, dim); } +void cudaD_vec_div_elements(int Gr, int Bl, double* v, const double* a, + int dim) { + _vec_div_elements<<>>(v, a, dim); +} + void cudaD_vec_min(int Gr, int Bl, const double* v, double* value, int dim, int inc) { _vec_transform_reduce<<>>(v, value, dim, inc, diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index a2c4aaceb3d..96319a37652 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -1326,6 +1326,14 @@ inline void cuda_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim) { cudaF_vec_mul_elements(Gr, Bl, v, a, dim); } +inline void cuda_vec_div_elements(int Gr, int Bl, double* v, const double* a, + int dim) { + cudaD_vec_div_elements(Gr, Bl, v, a, dim); +} +inline void cuda_vec_div_elements(int Gr, int Bl, float* v, const float* a, + int dim) { + cudaF_vec_div_elements(Gr, Bl, v, a, dim); +} inline void cuda_vec_soft_max(int Gr, int Bl, double* v, int dim) { cudaD_vec_soft_max(Gr, Bl, v, dim); } diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 595fd0aaa72..491dcf198c4 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -767,6 +767,25 @@ void CuVectorBase::MulElements(const CuVectorBase &v) { } } +template +void CuVectorBase::DivElements(const CuVectorBase &v) { + KALDI_ASSERT(dim_ == v.dim_); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + if (dim_ == 0) return; + CuTimer tim; + int dimBlock(CU1DBLOCK); + int dimGrid(n_blocks(dim_, CU1DBLOCK)); + cuda_vec_div_elements(dimGrid, dimBlock, data_, v.Data(), dim_); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile("CuVectorBase::DivElements", tim); + } else +#endif + { + Vec().DivElements(v.Vec()); + } +} + template<> template<> void CuVectorBase::CopyFromVec(const CuVectorBase &src) { diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index 53641556669..a72314b50f0 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -195,6 +195,9 @@ class CuVectorBase { void ReplaceValue(Real orig, Real changed); void MulElements(const CuVectorBase &v); 
+ + void DivElements(const CuVectorBase &v); + // The following two functions should only be called if we did not compile // with CUDA or could not get a CUDA card; in that case the contents are // interpreted the same as a regular vector. diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 084b33347df..febafa0c945 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -137,11 +137,19 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, BaseFloat tot_like, tot_l2_term, tot_weight; - ComputeChainObjfAndDeriv(chain_config_, den_graph_, - sup.supervision, nnet_output, - &tot_like, &tot_l2_term, &tot_weight, - (nnet_config_.compute_deriv ? &nnet_output_deriv : - NULL), (use_xent ? &xent_deriv : NULL)); + if (chain_config_.use_smbr_objective) + ComputeChainSmbrObjfAndDeriv( + chain_config_, den_graph_, + sup.supervision, nnet_output, + &tot_like, &tot_l2_term, &tot_weight, + (nnet_config_.compute_deriv ? &nnet_output_deriv : + NULL), (use_xent ? &xent_deriv : NULL)); + else + ComputeChainObjfAndDeriv(chain_config_, den_graph_, + sup.supervision, nnet_output, + &tot_like, &tot_l2_term, &tot_weight, + (nnet_config_.compute_deriv ? &nnet_output_deriv : + NULL), (use_xent ? &xent_deriv : NULL)); // note: in this context we don't want to apply 'sup.deriv_weights' because // this code is used only in combination, where it's part of an L-BFGS diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index d657581eaca..7ce2218b080 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -180,11 +180,19 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, BaseFloat tot_objf, tot_l2_term, tot_weight; - ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, - sup.supervision, nnet_output, - &tot_objf, &tot_l2_term, &tot_weight, - &nnet_output_deriv, - (use_xent ? &xent_deriv : NULL)); + if (opts_.chain_config.use_smbr_objective) { + ComputeChainSmbrObjfAndDeriv(opts_.chain_config, den_graph_, + sup.supervision, nnet_output, + &tot_objf, &tot_l2_term, &tot_weight, + &nnet_output_deriv, + (use_xent ? &xent_deriv : NULL)); + } else { + ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, + sup.supervision, nnet_output, + &tot_objf, &tot_l2_term, &tot_weight, + &nnet_output_deriv, + (use_xent ? &xent_deriv : NULL)); + } if (use_xent) { // this block computes the cross-entropy objective. 
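
A note on what ComputeChainSmbrObjfAndDeriv is actually optimizing: ignoring the leaky-HMM term and the per-frame renormalization, the SMBR objective is the expected frame accuracy under the denominator model, where the "accuracy" of each frame is weighted by the numerator posterior of each pdf-id. A minimal CPU-only sketch (the helper below and its arguments are illustrative only, not part of these patches):

BaseFloat ExpectedFrameAccuracy(const MatrixBase<BaseFloat> &den_post,
                                const MatrixBase<BaseFloat> &num_post) {
  // den_post, num_post: hypothetical (frames x pdfs) matrices of denominator
  // and numerator posteriors for a single sequence.
  KALDI_ASSERT(SameDim(den_post, num_post));
  double tot = 0.0;
  for (int32 t = 0; t < den_post.NumRows(); t++)
    for (int32 j = 0; j < den_post.NumCols(); j++)
      tot += den_post(t, j) * num_post(t, j);  // posterior-weighted accuracy
  return tot;
}

DenominatorSmbrComputation computes this quantity, and its derivative w.r.t. the nnet output, via the alpha_r/beta_r recursions documented in chain-denominator-smbr.h rather than by explicitly forming den_post.
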
From 2c43456dc2ee2f12b54c27124f3559d84e21c5de Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 22 Jun 2017 02:58:39 -0400 Subject: [PATCH 012/174] chain-smbr: Bug fixes --- src/chain/chain-denominator-smbr.cc | 13 ++++++------- src/chain/chain-denominator-smbr.h | 8 ++++---- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc index 752f94af504..d5da45eeb09 100644 --- a/src/chain/chain-denominator-smbr.cc +++ b/src/chain/chain-denominator-smbr.cc @@ -180,8 +180,8 @@ void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { this_tot_alpha += this_prev_alpha * transition_prob * prob; KALDI_ASSERT(num_posteriors_(t, pdf_id) > -1e-20); this_tot_alpha_smbr += - this_prev_alpha_smbr * this_prev_alpha / arbitrary_scale - + transition_prob * prob * num_posteriors_(t, pdf_id); + (this_prev_alpha_smbr + num_posteriors_(t, pdf_id)) + * this_prev_alpha * transition_prob * prob; } KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; @@ -419,9 +419,9 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { next_beta_smbr_j = next_beta_smbr[next_hmm_state + num_sequences + s]; BaseFloat variable_factor = transition_prob * next_beta_j * prob_data[pdf_id * prob_stride + s]; - beta_smbr += next_beta_smbr_j * next_beta_j - + prob_data[pdf_id * prob_stride + s] / inv_arbitrary_scale - * transition_prob * num_posteriors_(t, pdf_id); + beta_smbr += (next_beta_smbr_j + num_posteriors_(t, pdf_id)) + * next_beta_j * prob_data[pdf_id * prob_stride + s] + * transition_prob; tot_variable_factor += variable_factor; double this_gamma_r = occupation_factor * next_beta_j * transition_prob * (this_alpha_smbr_i + num_posteriors_(t, pdf_id) @@ -430,8 +430,7 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { } this_beta_dash[h * num_sequences + s] = tot_variable_factor / inv_arbitrary_scale; - this_beta_smbr[h * num_sequences + s] = beta_smbr / this_beta_dash[h * num_sequences + s]; - + this_beta_smbr[h * num_sequences + s] = beta_smbr / tot_variable_factor; } } } diff --git a/src/chain/chain-denominator-smbr.h b/src/chain/chain-denominator-smbr.h index 663f9387d23..c8b1855664c 100644 --- a/src/chain/chain-denominator-smbr.h +++ b/src/chain/chain-denominator-smbr.h @@ -79,7 +79,7 @@ namespace chain { alpha_r(t, i) = 0 for (j, p, n) in pred(i): # note: j is preceding-state. alpha(t, i) += x(t-1, n) * alpha(t-1, j) * p - alpha_r(t, i) += alpha_r(t-1, j) * alpha(t-1, j) + x(t-1, n) * p * (ref_pdf == pdf ? 1.0 : 0.0) + alpha_r(t, i) += (alpha_r(t-1, j) + (ref_pdf == pdf ? 1.0 : 0.0)) * alpha(t-1, j) * x(t-1, n) * p alpha_r(t, i) /= alpha(t, i) - total-prob = \sum_i alpha(T, i). # note, we take the final-probs of all states @@ -109,7 +109,7 @@ namespace chain { beta_r(t, i) = 0 for (j, p, n) in foll(i): # note: j is following-state. beta(t, i) += x(t, n) * beta(t+1, j) * p. - beta_r(t, i) += beta(t+1, j) * beta_r(t+1, j) + x(t, n) * p * (ref_pdf == pdf ? 1.0 : 0) + beta_r(t, i) += (beta_r(t+1, j) + (ref_pdf == pdf ? 1.0 : 0)) * beta(t+1, j) * x(t, n) * p gamma(t, n) += alpha(t, i) * x(t, n) * beta(t+1, j) * p. gamma_r(t, n) += alpha(t, i) * x(t, n) * beta(t+1, j) * p * (alpha_r(t, i) + (ref_pdf == pdf ? 1.0 : 0) + beta_r(t+1, j) - tot_objf) beta_r(t, i) /= beta(t, i) @@ -168,7 +168,7 @@ namespace chain { alpha_r(t, i) = 0 for (j, p, n) in pred(i): # note: j is preceding-state. 
alpha(t, i) += alpha'(t-1, j) * p * x(t-1, n) / tot-alpha(t-1) - alpha_r(t, i) += alpha_r(t-1, j) * alpha'(t-1, j) + x(t-1, n) / tot-alpha(t-1) * p * (ref_pdf == pdf ? 1.0 : 0.0) + alpha_r(t, i) += (alpha_r(t-1, j) + (ref_pdf == pdf ? 1.0 : 0.0)) * alpha'(t-1, j) * x(t-1, n) / tot-alpha(t-1) * p alpha_r(t, i) /= alpha(t,i) - total-prob = \sum_i alpha'(T, i) @@ -196,7 +196,7 @@ namespace chain { beta'(t, i) = 0 for (j, p, n) in foll(i): # note: j is following-state. beta'(t, i) += beta(t+1, j) * p * x(t, n) / tot-alpha(t) - beta_r(t, i) += beta(t+1, j) * beta_r(t+1, j) + x(t, n) / tot-alpha(t) * p * (ref_pdf == pdf ? 1.0 : 0) + beta_r(t, i) += (beta_r(t+1, j) + (ref_pdf == pdf ? 1.0 : 0)) * beta(t+1, j) * x(t, n) / tot-alpha(t) * p gamma(t, n) += alpha'(t, i) * beta(t+1, j) * p * x(t, n) / tot-alpha(t) gamma_r(t, n) += alpha'(t, i) * x(t, n) / tot-alpha(t) * beta(t+1, j) * p * (alpha_r(t, i) + (ref_pdf == pdf ? 1.0 : 0.0) + beta_r(t+1, j) - tot_objf) beta_r(t, i) /= beta(t, i) From 6adc948ee5de3a53ec88ee11205e76de067fb963 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 22 Jun 2017 02:57:07 -0400 Subject: [PATCH 013/174] Chain SMBR fixes Conflicts: src/chain/chain-denominator-smbr.cc --- src/chain/chain-denominator-smbr.cc | 15 +++++++++++---- src/chain/chain-kernels-ansi.h | 1 + src/chain/chain-kernels.cu | 22 ++++++++++++++-------- src/chain/chain-supervision-test.cc | 12 +++++++++--- src/chain/chain-training.cc | 3 +-- src/chain/language-model-test.cc | 2 +- src/chainbin/nnet3-chain-combine.cc | 2 +- src/cudamatrix/cu-kernels-ansi.h | 3 +++ src/nnet3/nnet-chain-diagnostics.cc | 10 +++++++--- src/nnet3/nnet-chain-diagnostics.h | 3 ++- 10 files changed, 50 insertions(+), 23 deletions(-) diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc index d5da45eeb09..bf1c8389879 100644 --- a/src/chain/chain-denominator-smbr.cc +++ b/src/chain/chain-denominator-smbr.cc @@ -131,7 +131,7 @@ void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { backward_transitions, transitions, num_sequences, den_graph_.NumStates(), prob_data, probs.Stride(), - num_posteriors_.Row(t).Data(), + num_posteriors_.Row(t).Data(), prev_alpha_dash, prev_alpha_smbr, this_alpha, this_alpha_smbr); CU_SAFE_CALL(cudaGetLastError()); @@ -184,8 +184,12 @@ void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { * this_prev_alpha * transition_prob * prob; } KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); + KALDI_ASSERT(this_tot_alpha_smbr - this_tot_alpha_smbr == 0); this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; - this_alpha_smbr[h * num_sequences + s] = this_tot_alpha_smbr / this_tot_alpha; + if (this_tot_alpha > 0.0) { + this_alpha_smbr[h * num_sequences + s] = + this_tot_alpha_smbr / this_tot_alpha; + } } } } @@ -372,6 +376,7 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { dimGrid, dimBlock, forward_transitions, transitions, num_sequences, num_hmm_states, probs.Data(), probs.Stride(), + num_posteriors_.Row(t).Data(), tot_smbr_.Data(), this_alpha_dash, this_alpha_smbr, next_beta, next_beta_smbr, this_beta_dash, this_beta_smbr, @@ -430,6 +435,7 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { } this_beta_dash[h * num_sequences + s] = tot_variable_factor / inv_arbitrary_scale; + if (tot_variable_factor > 0.0) this_beta_smbr[h * num_sequences + s] = beta_smbr / tot_variable_factor; } } @@ -448,8 +454,9 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { CuSubMatrix this_log_prob_deriv( 
nnet_output_deriv_transposed_, 0, num_pdfs, t_wrapped * num_sequences_, num_sequences_); - BaseFloat alpha_beta_product = VecVec(this_alpha_dash, - this_beta_dash), + BaseFloat alpha_beta_product = (VecVec(this_alpha_dash, this_beta_smbr) + + VecVec(this_alpha_smbr, this_beta_dash)) + / VecVec(this_alpha_dash, this_beta_dash), this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); if (!ApproxEqual(alpha_beta_product, num_sequences_)) { KALDI_WARN << "On time " << t << ", alpha-beta product " diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index eba731a034a..88656f425f1 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -56,6 +56,7 @@ extern "C" { const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *num_post, + const BaseFloat *tot_smbr, const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, const BaseFloat *next_beta, diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 3acceaa2bae..0ed80f93659 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -303,7 +303,7 @@ static void _cuda_chain_smbr_hmm_forward( BaseFloat arbitrary_scale = 1.0 / prev_alpha[num_hmm_states * num_sequences + s]; - double this_tot_alpha = 0.0, this_tot_alpha_smbr; + double this_tot_alpha = 0.0, this_tot_alpha_smbr = 0.0; const DenominatorGraphTransition *trans_iter = transitions + backward_transitions[h].first, *trans_end = transitions + backward_transitions[h].second; @@ -333,7 +333,7 @@ static void _cuda_chain_smbr_hmm_forward( this_tot_alpha_smbr += this_prev_alpha_smbr0 * this_prev_alpha0 / arbitrary_scale + transition_prob0 * pseudo_loglike0 * num_post[pdf_id0] - + this_prev_alpha_smbr0 * this_prev_alpha1 / arbitrary_scale + + this_prev_alpha_smbr1 * this_prev_alpha1 / arbitrary_scale + transition_prob1 * pseudo_loglike1 * num_post[pdf_id1]; } if (trans_iter != trans_end) { @@ -352,8 +352,11 @@ static void _cuda_chain_smbr_hmm_forward( } this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; - this_alpha_smbr[h * num_sequences + s] = - this_tot_alpha_smbr / this_tot_alpha; + + if (this_tot_alpha > 0.0) { + this_alpha_smbr[h * num_sequences + s] = + this_tot_alpha_smbr / this_tot_alpha; + } } @@ -362,7 +365,8 @@ static void _cuda_chain_smbr_hmm_backward( const Int32Pair *forward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, int32_cuda num_hmm_states, - const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *num_post, + const BaseFloat *probs, int32_cuda prob_stride, + const BaseFloat *num_post, const BaseFloat *tot_smbr, const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, BaseFloat *this_beta, BaseFloat *this_beta_smbr, @@ -458,7 +462,9 @@ static void _cuda_chain_smbr_hmm_backward( } BaseFloat beta = tot_variable_factor / inv_arbitrary_scale; this_beta[h * num_sequences + s] = beta; - this_beta_smbr = beta_smbr / beta; + + if (beta > 0.0) + this_beta_smbr[h * num_sequences + s] = beta_smbr / beta; } @@ -519,7 +525,7 @@ void cuda_chain_smbr_hmm_backward( int32_cuda num_sequences, int32_cuda num_hmm_states, const BaseFloat *probs, int32_cuda prob_stride, - const BaseFloat *num_post, + const BaseFloat *num_post, const BaseFloat *tot_smbr, const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, BaseFloat *this_beta, BaseFloat *this_beta_smbr, @@ -528,7 +534,7 @@ void 
cuda_chain_smbr_hmm_backward( _cuda_chain_smbr_hmm_backward<<>>( forward_transitions, transitions, num_sequences, num_hmm_states, - probs, prob_stride, num_post, + probs, prob_stride, num_post, tot_smbr, this_alpha, this_alpha_smbr, next_beta, next_beta_smbr, this_beta, this_beta_smbr, log_prob_deriv, log_prob_deriv_stride); diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index 7bf3c17854a..0c8d9f68aad 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -260,9 +260,15 @@ void ChainTrainingTest(const DenominatorGraph &den_graph, BaseFloat objf, l2_term, weight; - ComputeChainObjfAndDeriv(opts, den_graph, supervision, - nnet_output, &objf, &l2_term, &weight, - &nnet_output_deriv); + if (RandInt(0, 1) == 1) { + ComputeChainObjfAndDeriv(opts, den_graph, supervision, + nnet_output, &objf, &l2_term, &weight, + &nnet_output_deriv); + } else { + ComputeChainSmbrObjfAndDeriv(opts, den_graph, supervision, + nnet_output, &objf, &l2_term, &weight, + &nnet_output_deriv); + } { // make sure each row of nnet_output_deriv sums to one (shift invariance of diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 33be55ca94e..a1332156be9 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -121,8 +121,7 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, CuMatrixBase *nnet_output_deriv, CuMatrixBase *xent_output_deriv) { CuMatrix num_posteriors(nnet_output.NumRows(), - nnet_output.NumCols(), - kUndefined); + nnet_output.NumCols()); { NumeratorComputation numerator(supervision, nnet_output); // note: supervision.weight is included as a factor in the derivative from diff --git a/src/chain/language-model-test.cc b/src/chain/language-model-test.cc index 04a57441ada..286b3afc115 100644 --- a/src/chain/language-model-test.cc +++ b/src/chain/language-model-test.cc @@ -86,7 +86,7 @@ void LanguageModelTest() { LanguageModelEstimator estimator(opts); for (size_t i = 0; i < data.size(); i++) { std::vector &sentence = data[i]; - estimator.AddCounts(sentence); + estimator.AddCounts(sentence, 1); } fst::StdVectorFst fst; diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index 3c44e6b904c..a5ef5ac04e8 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -117,7 +117,7 @@ int main(int argc, char *argv[]) { nnet = combiner.GetNnet(); if (HasBatchnorm(nnet)) - RecomputeStats(egs, chain_config, den_fst, &nnet); + RecomputeStats(egs, chain_config, den_fst, den_fst_to_output, &nnet); #if HAVE_CUDA==1 CuDevice::Instantiate().PrintProfile(); diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 3b02b266a01..f8d56972a3e 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -675,6 +675,9 @@ void cudaF_vec_min(int Gr, int Bl, const float* v, float* value, int dim, void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim); void cudaF_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim); +void cudaD_vec_div_elements(int Gr, int Bl, double* v, const double* a, + int dim); +void cudaF_vec_div_elements(int Gr, int Bl, float* v, const float* a, int dim); void cudaD_vec_soft_max(int Gr, int Bl, double* v, int dim); void cudaF_vec_soft_max(int Gr, int Bl, float* v, int dim); void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc); diff --git a/src/nnet3/nnet-chain-diagnostics.cc 
b/src/nnet3/nnet-chain-diagnostics.cc index febafa0c945..95d38d57679 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -54,12 +54,14 @@ NnetChainComputeProb::NnetChainComputeProb( Nnet *nnet): nnet_config_(nnet_config), chain_config_(chain_config), - den_graph_(den_fst, nnet->OutputDim("output")), nnet_(*nnet), compiler_(*nnet, nnet_config_.optimize_config, nnet_config_.compiler_config), deriv_nnet_owned_(false), deriv_nnet_(nnet), num_minibatches_processed_(0) { + chain::DenominatorGraph den_graph(den_fst, nnet->OutputDim("output")); + KALDI_ASSERT(den_graph.NumPdfs() > 0); + den_graph_.insert(std::make_pair("output", den_graph)); KALDI_ASSERT(nnet_config.store_component_stats && !nnet_config.compute_deriv); } @@ -227,7 +229,8 @@ const ChainObjectiveInfo* NnetChainComputeProb::GetObjective( void RecomputeStats(const std::vector &egs, const chain::ChainTrainingOptions &chain_config_in, - const fst::StdVectorFst &den_fst, + const std::vector &den_fst, + const std::vector &den_to_output, Nnet *nnet) { KALDI_LOG << "Recomputing stats on nnet (affects batch-norm)"; chain::ChainTrainingOptions chain_config(chain_config_in); @@ -242,7 +245,8 @@ void RecomputeStats(const std::vector &egs, ZeroComponentStats(nnet); NnetComputeProbOptions nnet_config; nnet_config.store_component_stats = true; - NnetChainComputeProb prob_computer(nnet_config, chain_config, den_fst, nnet); + NnetChainComputeProb prob_computer(nnet_config, chain_config, den_fst, + den_to_output, *nnet); for (size_t i = 0; i < egs.size(); i++) prob_computer.Compute(egs[i]); prob_computer.PrintTotalStats(); diff --git a/src/nnet3/nnet-chain-diagnostics.h b/src/nnet3/nnet-chain-diagnostics.h index 4125427c463..047fed55dbb 100644 --- a/src/nnet3/nnet-chain-diagnostics.h +++ b/src/nnet3/nnet-chain-diagnostics.h @@ -111,7 +111,8 @@ class NnetChainComputeProb { /// declared in nnet-utils.h. void RecomputeStats(const std::vector &egs, const chain::ChainTrainingOptions &chain_config, - const fst::StdVectorFst &den_fst, + const std::vector &den_fst, + const std::vector &den_to_output, Nnet *nnet); From 29592794ad6459394a053581c17f5394b8e39aae Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 22 Jun 2017 03:06:15 -0400 Subject: [PATCH 014/174] chain-smbr: Bug fixes --- src/chain/chain-kernels.cu | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 0ed80f93659..aa9d36abcfb 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -331,10 +331,10 @@ static void _cuda_chain_smbr_hmm_forward( this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0 + this_prev_alpha1 * transition_prob1 * pseudo_loglike1; this_tot_alpha_smbr += - this_prev_alpha_smbr0 * this_prev_alpha0 / arbitrary_scale - + transition_prob0 * pseudo_loglike0 * num_post[pdf_id0] - + this_prev_alpha_smbr1 * this_prev_alpha1 / arbitrary_scale - + transition_prob1 * pseudo_loglike1 * num_post[pdf_id1]; + (this_prev_alpha_smbr0 + num_post[pdf_id0]) * this_prev_alpha0 + * transition_prob0 * pseudo_loglike0 + + (this_prev_alpha_smbr1 + num_post[pdf_id1] * this_prev_alpha1 + * transition_prob1 * pseudo_loglike1; } if (trans_iter != trans_end) { // mop up the odd transition. 
@@ -347,8 +347,8 @@ static void _cuda_chain_smbr_hmm_forward( prev_alpha_smbr[prev_hmm_state0 * num_sequences + s]; this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0; this_tot_alpha_smbr += - this_prev_alpha_smbr0 * this_prev_alpha0 / arbitrary_scale - + transition_prob0 * pseudo_loglike0 * num_post[pdf_id0]; + (this_prev_alpha_smbr0 + num_post[pdf_id0]) * this_prev_alpha0 + * transition_prob0 * pseudo_loglike0; } this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; @@ -425,10 +425,10 @@ static void _cuda_chain_smbr_hmm_backward( BaseFloat variable_factor0 = transition_prob0 * next_beta_j0 * prob0, variable_factor1 = transition_prob1 * next_beta_j1 * prob1; - beta_smbr += next_beta_smbr_j0 * next_beta_j0 - + prob0 / inv_arbitrary_scale * transition_prob0 * num_post0 - + next_beta_smbr_j1 * next_beta_j1 - + prob1 / inv_arbitrary_scale * transition_prob1 * num_post1; + beta_smbr += (next_beta_smbr_j0 + num_post0) * next_beta_j0 + * prob0 * transition_prob0 + + (next_beta_smbr_j1 + num_post1) * next_beta_j1 + * prob1 * transition_prob1; tot_variable_factor += variable_factor0 + variable_factor1; BaseFloat this_gamma_r0 = occupation_factor * next_beta_j0 * transition_prob0 * (this_alpha_smbr_i + num_post0 @@ -451,8 +451,8 @@ static void _cuda_chain_smbr_hmm_backward( prob0 = probs[pdf_id0 * prob_stride + s], num_post0 = num_post[pdf_id0]; BaseFloat variable_factor0 = transition_prob0 * next_beta_j0 * prob0; - beta_smbr += next_beta_smbr_j0 * next_beta_j0 - + prob0 / inv_arbitrary_scale * transition_prob0 * num_post0; + beta_smbr += (next_beta_smbr_j0 + num_post0) * next_beta_j0 + * prob0 * transition_prob0; tot_variable_factor += variable_factor0; BaseFloat this_gamma_r0 = occupation_factor * next_beta_j0 * transition_prob0 * (this_alpha_smbr_i + num_post0 From 758e9a4e9487beb1db388b3d54d091bd98cc19e9 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 22 Jun 2017 14:27:08 -0400 Subject: [PATCH 015/174] chain-smbr: Bug fix --- src/chain/chain-denominator-smbr.cc | 40 +++++++++++++++++++---------- src/chain/chain-kernels-ansi.h | 2 ++ src/chain/chain-kernels.cu | 29 +++++++++++++-------- src/chain/chain-training.cc | 5 ++++ 4 files changed, 52 insertions(+), 24 deletions(-) diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc index bf1c8389879..4b705267fcd 100644 --- a/src/chain/chain-denominator-smbr.cc +++ b/src/chain/chain-denominator-smbr.cc @@ -29,13 +29,13 @@ DenominatorSmbrComputation::DenominatorSmbrComputation( const DenominatorGraph &den_graph, int32 num_sequences, const CuMatrixBase &nnet_output, - const CuMatrixBase &num_posteriors): + const CuMatrixBase &numerator_posteriors): opts_(opts), den_graph_(den_graph), num_sequences_(num_sequences), frames_per_sequence_(nnet_output.NumRows() / num_sequences_), exp_nnet_output_transposed_(nnet_output, kTrans), - num_posteriors_(num_posteriors), + numerator_posteriors_transposed(numerator_posteriors, kTrans), nnet_output_deriv_transposed_( exp_nnet_output_transposed_.NumRows(), std::min(exp_nnet_output_transposed_.NumCols(), @@ -118,6 +118,12 @@ void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { (t-1) * num_sequences_, num_sequences_); const BaseFloat *prob_data = probs.Data(); + // 'numerator_post' is the matrix of numerator posteriors for frame t - 1. 
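+  // Like 'probs' above, it is laid out with one row per pdf-id and one column
+  // per sequence, so entry (pdf, s) lives at post_data[pdf * post_stride + s].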
+ CuSubMatrix numerator_post( + numerator_posteriors_transposed_, 0, num_pdfs, + (t-1) * num_sequences_, num_sequences_); + const BaseFloat *post_data = numerator_post.Data(); + #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; @@ -131,7 +137,7 @@ void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { backward_transitions, transitions, num_sequences, den_graph_.NumStates(), prob_data, probs.Stride(), - num_posteriors_.Row(t).Data(), + post_data, numerator_post.Stride(), prev_alpha_dash, prev_alpha_smbr, this_alpha, this_alpha_smbr); CU_SAFE_CALL(cudaGetLastError()); @@ -152,7 +158,8 @@ void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { } else #endif { - int32 prob_stride = probs.Stride(); + int32 prob_stride = probs.Stride(), + post_stride = numerator_post.Stride(); for (int32 h = 0; h < num_hmm_states; h++) { for (int32 s = 0; s < num_sequences; s++) { // Let arbitrary_scale be the inverse of the alpha-sum value that we @@ -175,12 +182,13 @@ void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { int32 pdf_id = trans_iter->pdf_id, prev_hmm_state = trans_iter->hmm_state; BaseFloat prob = prob_data[pdf_id * prob_stride + s], + post = post_data[pdf_id * post_stride + s], this_prev_alpha = prev_alpha_dash[prev_hmm_state * num_sequences + s], this_prev_alpha_smbr = prev_alpha_smbr[prev_hmm_state * num_sequences + s]; this_tot_alpha += this_prev_alpha * transition_prob * prob; - KALDI_ASSERT(num_posteriors_(t, pdf_id) > -1e-20); + KALDI_ASSERT(post > -1e-20); this_tot_alpha_smbr += - (this_prev_alpha_smbr + num_posteriors_(t, pdf_id)) + (this_prev_alpha_smbr + post) * this_prev_alpha * transition_prob * prob; } KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); @@ -356,8 +364,11 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { const Int32Pair *forward_transitions = den_graph_.ForwardTransitions(); const DenominatorGraphTransition *transitions = den_graph_.Transitions(); // 'probs' is the matrix of pseudo-likelihoods for frame t. + // 'numerator_post' is the matrix of numerator posteriors for frame t. 
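+  // (Note: unlike the alpha recursion, which reads frame t - 1, the beta
+  // recursion reads the likelihoods and posteriors of frame t itself.)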
CuSubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, t * num_sequences_, num_sequences_), + numerator_post(numerator_posteriors_transposed_, 0, num_pdfs, + t * num_sequences_, num_sequences_), log_prob_deriv(nnet_output_deriv_transposed_, 0, num_pdfs, t_wrapped * num_sequences_, num_sequences_); @@ -376,7 +387,8 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { dimGrid, dimBlock, forward_transitions, transitions, num_sequences, num_hmm_states, probs.Data(), probs.Stride(), - num_posteriors_.Row(t).Data(), tot_smbr_.Data(), + numerator_post.Data(), numerator_post.Stride(), + tot_smbr_.Data(), this_alpha_dash, this_alpha_smbr, next_beta, next_beta_smbr, this_beta_dash, this_beta_smbr, @@ -401,8 +413,10 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { #endif { int32 prob_stride = probs.Stride(), + post_stride = numerator_post.Stride(), deriv_stride = log_prob_deriv.Stride(); const BaseFloat *prob_data = probs.Data(); + const BaseFloat *post_data = numerator_post.Data(); BaseFloat *log_prob_deriv_data = log_prob_deriv.Data(); for (int32 h = 0; h < num_hmm_states; h++) { for (int32 s = 0; s < num_sequences; s++) { @@ -422,14 +436,14 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { next_hmm_state = trans_iter->hmm_state; BaseFloat next_beta_j = next_beta[next_hmm_state + num_sequences + s], next_beta_smbr_j = next_beta_smbr[next_hmm_state + num_sequences + s]; - BaseFloat variable_factor = transition_prob * next_beta_j * - prob_data[pdf_id * prob_stride + s]; - beta_smbr += (next_beta_smbr_j + num_posteriors_(t, pdf_id)) - * next_beta_j * prob_data[pdf_id * prob_stride + s] - * transition_prob; + BaseFloat prob = prob_data[pdf_id * prob_stride + s], + post = post_data[pdf_id * post_stride + s], + variable_factor = transition_prob * next_beta_j * prob; + beta_smbr += (next_beta_smbr_j + post) + * next_beta_j * prob * transition_prob; tot_variable_factor += variable_factor; double this_gamma_r = occupation_factor * next_beta_j - * transition_prob * (this_alpha_smbr_i + num_posteriors_(t, pdf_id) + * transition_prob * (this_alpha_smbr_i + post + next_beta_smbr_j - tot_smbr_(s)); log_prob_deriv_data[pdf_id * deriv_stride + s] += this_gamma_r; } diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index 88656f425f1..d37bcd83e5a 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -56,6 +56,7 @@ extern "C" { const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *num_post, + int32_cuda post_stride, const BaseFloat *tot_smbr, const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, @@ -74,6 +75,7 @@ extern "C" { const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *num_post, + int32_cuda post_stride, const BaseFloat *prev_alpha, const BaseFloat *prev_alpha_smbr, BaseFloat *this_alpha, diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index aa9d36abcfb..2435229f361 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -265,13 +265,15 @@ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, // note that num_sequences == the number of elements in the minibatch, and we // insist they all have the same number of time steps. // note: 'probs' is indexed by sequence-index + (pdf-index * prob_stride). +// note: 'num_post' is indexed by sequence-index + (pdf-index * post_stride). 
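+// (Previously 'num_post' was a single per-frame row indexed by pdf-id only;
+// it is now a strided sub-matrix over all sequences, hence 'post_stride'.)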
__global__ static void _cuda_chain_smbr_hmm_forward( const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, int32_cuda num_hmm_states, - const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *num_post, + const BaseFloat *probs, int32_cuda prob_stride, + const BaseFloat *num_post, int32_cuda post_stride, const BaseFloat *prev_alpha, const BaseFloat *prev_alpha_smbr, BaseFloat *this_alpha, BaseFloat *this_alpha_smbr) { // 'backward_transitions', indexed by hmm-state, consists of [start, end] @@ -321,19 +323,21 @@ static void _cuda_chain_smbr_hmm_forward( int32_cuda pdf_id1 = trans_iter[1].pdf_id, prev_hmm_state1 = trans_iter[1].hmm_state; BaseFloat pseudo_loglike0 = probs[pdf_id0 * prob_stride + s], + num_post0 = num_post[pdf_id0 * post_stride + s], this_prev_alpha0 = prev_alpha[prev_hmm_state0 * num_sequences + s], this_prev_alpha_smbr0 = prev_alpha_smbr[prev_hmm_state0 * num_sequences + s], pseudo_loglike1 = probs[pdf_id1 * prob_stride + s], + num_post1 = num_post[pdf_id1 * post_stride + s], this_prev_alpha1 = prev_alpha[prev_hmm_state1 * num_sequences + s], this_prev_alpha_smbr1 = prev_alpha[prev_hmm_state1 * num_sequences + s]; this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0 + this_prev_alpha1 * transition_prob1 * pseudo_loglike1; this_tot_alpha_smbr += - (this_prev_alpha_smbr0 + num_post[pdf_id0]) * this_prev_alpha0 + (this_prev_alpha_smbr0 + num_post0) * this_prev_alpha0 * transition_prob0 * pseudo_loglike0 - + (this_prev_alpha_smbr1 + num_post[pdf_id1] * this_prev_alpha1 + + (this_prev_alpha_smbr1 + num_post1 * this_prev_alpha1 * transition_prob1 * pseudo_loglike1; } if (trans_iter != trans_end) { @@ -347,7 +351,7 @@ static void _cuda_chain_smbr_hmm_forward( prev_alpha_smbr[prev_hmm_state0 * num_sequences + s]; this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0; this_tot_alpha_smbr += - (this_prev_alpha_smbr0 + num_post[pdf_id0]) * this_prev_alpha0 + (this_prev_alpha_smbr0 + num_post0) * this_prev_alpha0 * transition_prob0 * pseudo_loglike0; } @@ -366,7 +370,8 @@ static void _cuda_chain_smbr_hmm_backward( const DenominatorGraphTransition *transitions, int32_cuda num_sequences, int32_cuda num_hmm_states, const BaseFloat *probs, int32_cuda prob_stride, - const BaseFloat *num_post, const BaseFloat *tot_smbr, + const BaseFloat *num_post, int32_cuda post_stride, + const BaseFloat *tot_smbr, const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, BaseFloat *this_beta, BaseFloat *this_beta_smbr, @@ -421,7 +426,8 @@ static void _cuda_chain_smbr_hmm_backward( next_beta_smbr_j1 = next_beta_smbr[next_hmm_state1 * num_sequences + s], prob0 = probs[pdf_id0 * prob_stride + s], prob1 = probs[pdf_id1 * prob_stride + s], - num_post0 = num_post[pdf_id0], num_post1 = num_post[pdf_id1]; + num_post0 = num_post[pdf_id0 * post_stride + s], + num_post1 = num_post[pdf_id1 * post_stride + s]; BaseFloat variable_factor0 = transition_prob0 * next_beta_j0 * prob0, variable_factor1 = transition_prob1 * next_beta_j1 * prob1; @@ -449,7 +455,7 @@ static void _cuda_chain_smbr_hmm_backward( BaseFloat next_beta_j0 = next_beta[next_hmm_state0 * num_sequences + s], next_beta_smbr_j0 = next_beta_smbr[next_hmm_state0 * num_sequences + s], prob0 = probs[pdf_id0 * prob_stride + s], - num_post0 = num_post[pdf_id0]; + num_post0 = num_post[pdf_id0 * post_stride + s]; BaseFloat variable_factor0 = transition_prob0 * next_beta_j0 * prob0; beta_smbr += 
(next_beta_smbr_j0 + num_post0) * next_beta_j0 * prob0 * transition_prob0; @@ -508,13 +514,13 @@ void cuda_chain_smbr_hmm_forward( int32_cuda num_sequences, int32_cuda num_hmm_states, const BaseFloat *probs, int32_cuda prob_stride, - const BaseFloat *num_post, + const BaseFloat *num_post, int32_cuda post_stride, const BaseFloat *prev_alpha, const BaseFloat *prev_alpha_smbr, BaseFloat *this_alpha, BaseFloat *this_alpha_smbr) { _cuda_chain_smbr_hmm_forward<<>>( backward_transitions, transitions, num_sequences, num_hmm_states, - probs, prob_stride, num_post, + probs, prob_stride, num_post, post_stride, prev_alpha, prev_alpha_smbr, this_alpha, this_alpha_smbr); } @@ -525,7 +531,8 @@ void cuda_chain_smbr_hmm_backward( int32_cuda num_sequences, int32_cuda num_hmm_states, const BaseFloat *probs, int32_cuda prob_stride, - const BaseFloat *num_post, const BaseFloat *tot_smbr, + const BaseFloat *num_post, int32_cuda post_stride, + const BaseFloat *tot_smbr, const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, BaseFloat *this_beta, BaseFloat *this_beta_smbr, @@ -534,7 +541,7 @@ void cuda_chain_smbr_hmm_backward( _cuda_chain_smbr_hmm_backward<<>>( forward_transitions, transitions, num_sequences, num_hmm_states, - probs, prob_stride, num_post, tot_smbr, + probs, prob_stride, num_post, post_stride, tot_smbr, this_alpha, this_alpha_smbr, next_beta, next_beta_smbr, this_beta, this_beta_smbr, log_prob_deriv, log_prob_deriv_stride); diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index a1332156be9..3aebe3bfcfb 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -120,6 +120,11 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, CuMatrixBase *xent_output_deriv) { + // num_posteriors is a matrix of size + // (num_sequences * frames_per_sequence) x num_pdfs and is ordered in the + // same way as nnet_output is i.e. + // first the first frame of each sequence, then the second frame of + // each sequence, and so on. 
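+  // For example, with sequences {a, b} and 3 frames per sequence, the rows
+  // are ordered a0, b0, a1, b1, a2, b2.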
CuMatrix num_posteriors(nnet_output.NumRows(), nnet_output.NumCols()); { From 57d1016f6f4c461cdbe3b1fb98b69fb2491ba27d Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 22 Jun 2017 14:28:44 -0400 Subject: [PATCH 016/174] temp --- src/chain/chain-denominator-smbr.cc | 28 ++++++++++++++++------------ src/chain/chain-kernels.cu | 4 ++++ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc index 4b705267fcd..19aceff44f2 100644 --- a/src/chain/chain-denominator-smbr.cc +++ b/src/chain/chain-denominator-smbr.cc @@ -190,6 +190,7 @@ void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { this_tot_alpha_smbr += (this_prev_alpha_smbr + post) * this_prev_alpha * transition_prob * prob; + KALDI_ASSERT(this_tot_alpha_smbr - this_tot_alpha_smbr == 0); } KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); KALDI_ASSERT(this_tot_alpha_smbr - this_tot_alpha_smbr == 0); @@ -197,6 +198,8 @@ void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { if (this_tot_alpha > 0.0) { this_alpha_smbr[h * num_sequences + s] = this_tot_alpha_smbr / this_tot_alpha; + } else { + this_alpha_smbr[h * num_sequences + s] = 0.0; } } } @@ -468,9 +471,7 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { CuSubMatrix this_log_prob_deriv( nnet_output_deriv_transposed_, 0, num_pdfs, t_wrapped * num_sequences_, num_sequences_); - BaseFloat alpha_beta_product = (VecVec(this_alpha_dash, this_beta_smbr) - + VecVec(this_alpha_smbr, this_beta_dash)) - / VecVec(this_alpha_dash, this_beta_dash), + BaseFloat alpha_beta_product = VecVec(this_alpha_dash, this_beta_dash), this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); if (!ApproxEqual(alpha_beta_product, num_sequences_)) { KALDI_WARN << "On time " << t << ", alpha-beta product " @@ -482,17 +483,20 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { ok_ = false; } } + + //BaseFloat acc = (VecVec(this_alpha_smbr, this_alpha_dash) + // + VecVec(this_beta_dash, this_beta_smbr)) + // / alpha_beta_product; // use higher tolerance, since we are using randomized pruning for the // log-prob derivatives. 
- if (!ApproxEqual(this_log_prob_deriv_sum, - num_sequences_, 0.01)) { - KALDI_WARN << "On time " << t << ", log-prob-deriv sum " - << this_log_prob_deriv_sum << " != " << num_sequences_; - if (fabs(this_log_prob_deriv_sum - num_sequences_) > 2.0) { - KALDI_WARN << "Excessive error detected, will abandon this minibatch"; - ok_ = false; - } - } + if (!ApproxEqual(this_log_prob_deriv_sum, 0, 0.01)) { + KALDI_WARN << "On time " << t << ", log-prob-deriv sum " + << this_log_prob_deriv_sum << " != " << 0; + if (fabs(this_log_prob_deriv_sum - 0) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } } diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 2435229f361..c7e207707d4 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -360,6 +360,8 @@ static void _cuda_chain_smbr_hmm_forward( if (this_tot_alpha > 0.0) { this_alpha_smbr[h * num_sequences + s] = this_tot_alpha_smbr / this_tot_alpha; + } else { + this_alpha_smbr[h * num_sequences + s] = 0.0; } } @@ -471,6 +473,8 @@ static void _cuda_chain_smbr_hmm_backward( if (beta > 0.0) this_beta_smbr[h * num_sequences + s] = beta_smbr / beta; + else + this_beta_smbr[h * num_sequences + s] = 0.0; } From a03b4010aa25769cbf4e6c6475f90dcfefb16246 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 22 Jun 2017 14:27:08 -0400 Subject: [PATCH 017/174] smbr-dash --- .../nnet3/train/chain_objf/acoustic_model.py | 31 ++- src/chain/Makefile | 2 +- src/chain/chain-denominator-smbr.cc | 135 +++++---- src/chain/chain-denominator-smbr.h | 19 +- src/chain/chain-denominator.cc | 4 +- src/chain/chain-kernels.cu | 263 +----------------- src/chain/chain-supervision-test.cc | 218 ++++++++++++++- src/cudamatrix/cu-matrix.cc | 1 - src/cudamatrix/cu-matrix.h | 7 + src/cudamatrix/cu-vector.h | 5 + src/matrix/kaldi-matrix.cc | 17 ++ src/matrix/kaldi-matrix.h | 2 + 12 files changed, 352 insertions(+), 352 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 3a9a90ed988..72a8cdd0f1c 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -131,7 +131,8 @@ def train_new_models(dir, iter, srand, num_jobs, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch_str, frame_subsampling_factor, run_opts, - backstitch_training_scale=0.0, backstitch_training_interval=1): + backstitch_training_scale=0.0, backstitch_training_interval=1, + use_smbr_objective=False): """ Called from train_one_iteration(), this method trains new models with 'num_jobs' jobs, and @@ -212,7 +213,9 @@ def train_new_models(dir, iter, srand, num_jobs, raw_model=raw_model_string, egs_dir=egs_dir, archive_index=archive_index, buf_size=shuffle_buffer_size, - num_chunk_per_mb=num_chunk_per_minibatch_str), + num_chunk_per_mb=num_chunk_per_minibatch_str, + smbr_opt="--use-smbr-objective" + if use_smbr_objective is not None else ""), require_zero_status=True) threads.append(thread) @@ -234,7 +237,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, momentum, max_param_change, shuffle_buffer_size, frame_subsampling_factor, run_opts, dropout_edit_string="", - backstitch_training_scale=0.0, backstitch_training_interval=1): + backstitch_training_scale=0.0, backstitch_training_interval=1, + use_smbr_objective=False): """ Called from steps/nnet3/chain/train.py for one iteration for neural network training with 
LF-MMI objective @@ -266,7 +270,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, compute_train_cv_probabilities( dir=dir, iter=iter, egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, - leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts) + leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, + use_smbr_objective=use_smbr_objective) if iter > 0: # Runs in the background @@ -323,7 +328,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, # first few iterations (hard-coded as 15) backstitch_training_scale=(backstitch_training_scale * iter / 15 if iter < 15 else backstitch_training_scale), - backstitch_training_interval=backstitch_training_interval) + backstitch_training_interval=backstitch_training_interval, + use_smbr_objective=use_smbr_objective) [models_to_average, best_model] = common_train_lib.get_successful_models( num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) @@ -450,7 +456,8 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1): def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, - run_opts): + run_opts, + use_smbr_objective=False): model = '{0}/{1}.mdl'.format(dir, iter) common_lib.background_command( @@ -463,7 +470,10 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, """.format(command=run_opts.command, dir=dir, iter=iter, model=model, l2=l2_regularize, leaky=leaky_hmm_coefficient, xent_reg=xent_regularize, - egs_dir=egs_dir)) + egs_dir=egs_dir, + smbr_opt="--use-smbr-objective" + if use_smbr_objective else "")) + common_lib.background_command( """{command} {dir}/log/compute_prob_train.{iter}.log \ @@ -475,7 +485,9 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, """.format(command=run_opts.command, dir=dir, iter=iter, model=model, l2=l2_regularize, leaky=leaky_hmm_coefficient, xent_reg=xent_regularize, - egs_dir=egs_dir)) + egs_dir=egs_dir, + smbr_opt="--use-smbr-objective" + if use_smbr_objective else "")) def compute_progress(dir, iter, run_opts): @@ -565,4 +577,5 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st dir=dir, iter='final', egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, leaky_hmm_coefficient=leaky_hmm_coefficient, - run_opts=run_opts) + run_opts=run_opts, + use_smbr_objective=use_smbr_objective) diff --git a/src/chain/Makefile b/src/chain/Makefile index 80bf66d4cb8..ca23450ae50 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -11,7 +11,7 @@ OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \ language-model.o chain-denominator.o chain-training.o \ chain-denominator-smbr.o ifeq ($(CUDA), true) - OBJFILES += chain-kernels.o + OBJFILES += chain-kernels.o chain-smbr-kernels.o endif LIBNAME = kaldi-chain diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc index 19aceff44f2..c83d42daa0e 100644 --- a/src/chain/chain-denominator-smbr.cc +++ b/src/chain/chain-denominator-smbr.cc @@ -35,7 +35,7 @@ DenominatorSmbrComputation::DenominatorSmbrComputation( num_sequences_(num_sequences), frames_per_sequence_(nnet_output.NumRows() / num_sequences_), exp_nnet_output_transposed_(nnet_output, kTrans), - numerator_posteriors_transposed(numerator_posteriors, kTrans), + numerator_posteriors_transposed_(numerator_posteriors, kTrans), nnet_output_deriv_transposed_( exp_nnet_output_transposed_.NumRows(), std::min(exp_nnet_output_transposed_.NumCols(), @@ -45,16 +45,14 @@ 
DenominatorSmbrComputation::DenominatorSmbrComputation( den_graph_.NumStates() * num_sequences_ + num_sequences_, kUndefined), alpha_smbr_(frames_per_sequence_ + 1, - den_graph_.NumStates() * num_sequences_ + num_sequences_, - kUndefined), + den_graph_.NumStates() * num_sequences_ + num_sequences_), beta_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_, kUndefined), - beta_smbr_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_, - kUndefined), + beta_smbr_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_), tot_prob_(num_sequences_, kUndefined), - tot_smbr_(num_sequences_, kUndefined), + tot_smbr_(num_sequences_), ok_(true) { - KALDI_ASSERT(opts_.leaky_hmm_coefficient > 0.0 && + KALDI_ASSERT(opts_.leaky_hmm_coefficient >= 0.0 && opts_.leaky_hmm_coefficient < 1.0); // make sure the alpha sums and beta sums are zeroed. alpha_.ColRange(den_graph_.NumStates() * num_sequences_, @@ -162,16 +160,6 @@ void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { post_stride = numerator_post.Stride(); for (int32 h = 0; h < num_hmm_states; h++) { for (int32 s = 0; s < num_sequences; s++) { - // Let arbitrary_scale be the inverse of the alpha-sum value that we - // store in the same place we'd store the alpha for the state numbered - // 'num_hmm_states'. We multiply this into all the - // transition-probabilities from the previous frame to this frame, in - // both the forward and backward passes, in order to keep the alphas in - // a good numeric range. This won't affect the posteriors, but when - // computing the total likelihood we'll need to compensate for it later - // on. - BaseFloat arbitrary_scale = - 1.0 / prev_alpha_dash[num_hmm_states * num_sequences + s]; double this_tot_alpha = 0.0; double this_tot_alpha_smbr = 0.0; const DenominatorGraphTransition @@ -190,24 +178,28 @@ void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { this_tot_alpha_smbr += (this_prev_alpha_smbr + post) * this_prev_alpha * transition_prob * prob; - KALDI_ASSERT(this_tot_alpha_smbr - this_tot_alpha_smbr == 0); } + // Let arbitrary_scale be the inverse of the alpha-sum value that we + // store in the same place we'd store the alpha for the state numbered + // 'num_hmm_states'. We multiply this into all the + // transition-probabilities from the previous frame to this frame, in + // both the forward and backward passes, in order to keep the alphas in + // a good numeric range. This won't affect the posteriors, but when + // computing the total likelihood we'll need to compensate for it later + // on. + BaseFloat arbitrary_scale = + 1.0 / prev_alpha_dash[num_hmm_states * num_sequences + s]; KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); KALDI_ASSERT(this_tot_alpha_smbr - this_tot_alpha_smbr == 0); this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; - if (this_tot_alpha > 0.0) { - this_alpha_smbr[h * num_sequences + s] = - this_tot_alpha_smbr / this_tot_alpha; - } else { - this_alpha_smbr[h * num_sequences + s] = 0.0; - } + this_alpha_smbr[h * num_sequences + s] = + this_tot_alpha_smbr * arbitrary_scale; } } } } - -void DenominatorSmbrComputation::AlphaDash(int32 t) { +void DenominatorSmbrComputation::AlphaSmbrDash(int32 t) { BaseFloat *this_alpha = alpha_.RowData(t); // create a 'fake matrix' for the regular alphas- view this row as a matrix. @@ -217,20 +209,27 @@ void DenominatorSmbrComputation::AlphaDash(int32 t) { num_sequences_, num_sequences_); - // the alpha-dash is the sum of alpha over all states. 
+ // Compute the sum of alpha over all states i for the current time. + // This is done for each sequence and stored in the last 'num_sequences_' + // columns. CuSubVector alpha_sum_vec(this_alpha + den_graph_.NumStates() * num_sequences_, num_sequences_); alpha_sum_vec.AddRowSumMat(1.0, alpha_mat, 0.0); + BaseFloat alpha_sum = alpha_sum_vec.Sum(); + KALDI_VLOG(2) << "alpha-sum for time " << t << " is " << alpha_sum; + KALDI_ASSERT(alpha_sum_vec.Min() > 0); + alpha_mat.AddVecVec(opts_.leaky_hmm_coefficient, den_graph_.InitialProbs(), alpha_sum_vec); // it's now alpha-dash. + alpha_smbr_.Row(t).DivElements(alpha_.Row(t)); } // compute beta from beta-dash. -void DenominatorSmbrComputation::Beta(int32 t) { +void DenominatorSmbrComputation::BetaSmbr(int32 t) { BaseFloat *this_beta_dash = beta_.RowData(t % 2); // create a 'fake matrix' for the regular beta-dash (which is // the counterpart of alpha-dash)- view this row as a matrix. @@ -250,15 +249,16 @@ void DenominatorSmbrComputation::Beta(int32 t) { // will contain the actual beta (i.e. the counterpart of alpha), // not the beta-dash. beta_dash_mat.AddVecToRows(1.0, beta_dash_sum_vec); + beta_smbr_.Row(t % 2).DivElements(beta_.Row(t % 2)); } BaseFloat DenominatorSmbrComputation::ForwardSmbr() { AlphaFirstFrame(); AlphaSmbrFirstFrame(); - AlphaDash(0); + AlphaSmbrDash(0); for (int32 t = 1; t <= frames_per_sequence_; t++) { AlphaSmbrGeneralFrame(t); - AlphaDash(t); + AlphaSmbrDash(t); } return ComputeTotObjf(); } @@ -272,14 +272,21 @@ BaseFloat DenominatorSmbrComputation::ComputeTotObjf() { den_graph_.NumStates(), num_sequences_, num_sequences_); - CuSubMatrix last_alpha_smbr( + CuMatrix last_alpha_smbr(CuSubMatrix ( alpha_smbr_.RowData(frames_per_sequence_), den_graph_.NumStates(), num_sequences_, - num_sequences_); + num_sequences_)); // TODO: Make this vector multiplication + // Sum over all the HMM states for each sequence. tot_prob_.AddRowSumMat(1.0, last_alpha_dash, 0.0); + + BaseFloat prob_sum = tot_prob_.Sum(); + KALDI_ASSERT(prob_sum == prob_sum); + + // Take weighted-average of the SMBR quantitites over all the + // HMM states for each sequence. last_alpha_smbr.MulElements(last_alpha_dash); tot_smbr_.AddRowSumMat(1.0, last_alpha_smbr, 0.0); tot_smbr_.DivElements(tot_prob_); @@ -293,12 +300,12 @@ bool DenominatorSmbrComputation::BackwardSmbr( BaseFloat deriv_weight, CuMatrixBase *nnet_output_deriv) { BetaDashLastFrame(); - Beta(frames_per_sequence_); + BetaSmbr(frames_per_sequence_); for (int32 t = frames_per_sequence_ - 1; t >= 0; t--) { BetaSmbrGeneralFrame(t); if (GetVerboseLevel() >= 1 || t == 0) BetaSmbrGeneralFrameDebug(t); - Beta(t); + BetaSmbr(t); if (t % kMaxDerivTimeSteps == 0) { // commit the derivative stored in exp_nnet_output_transposed_ by adding // its transpose to the appropriate sub-matrix of 'nnet_output_deriv'. 
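
The transposed derivative buffer only spans kMaxDerivTimeSteps frames (see the constructor), so the beta recursion flushes it into 'nnet_output_deriv' every kMaxDerivTimeSteps frames. Roughly, that commit step amounts to the following (a simplified sketch; the helper name and arguments are illustrative only):

// Illustrative sketch: add the transposed (pdfs x frames*sequences) chunk of
// derivatives into the (frames*sequences x pdfs) output derivative, for the
// chunk of frames starting at 't'.
void CommitDerivChunk(const CuMatrixBase<BaseFloat> &deriv_transposed,
                      int32 t, int32 chunk_frames, int32 num_sequences,
                      BaseFloat deriv_weight,
                      CuMatrixBase<BaseFloat> *nnet_output_deriv) {
  int32 num_pdfs = deriv_transposed.NumRows();
  CuSubMatrix<BaseFloat> src(deriv_transposed, 0, num_pdfs,
                             0, chunk_frames * num_sequences);
  CuSubMatrix<BaseFloat> dst(*nnet_output_deriv,
                             t * num_sequences, chunk_frames * num_sequences,
                             0, num_pdfs);
  dst.AddMat(deriv_weight, src, kTrans);  // transpose back to frames x pdfs
}
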
@@ -425,9 +432,9 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { for (int32 s = 0; s < num_sequences; s++) { BaseFloat this_alpha_dash_prob = this_alpha_dash[h * num_sequences + s], this_alpha_smbr_i = this_alpha_smbr[h * num_sequences + s], - inv_arbitrary_scale = + inv_arbitrary_scale = this_alpha_dash[num_hmm_states * num_sequences + s]; - double tot_variable_factor = 0.0, beta_smbr = 0.0; + double tot_variable_factor = 0.0, tot_beta_smbr = 0.0; BaseFloat occupation_factor = this_alpha_dash_prob / inv_arbitrary_scale; const DenominatorGraphTransition @@ -437,23 +444,21 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { BaseFloat transition_prob = trans_iter->transition_prob; int32 pdf_id = trans_iter->pdf_id, next_hmm_state = trans_iter->hmm_state; - BaseFloat next_beta_j = next_beta[next_hmm_state + num_sequences + s], - next_beta_smbr_j = next_beta_smbr[next_hmm_state + num_sequences + s]; + BaseFloat next_beta_j = next_beta[next_hmm_state * num_sequences + s], + next_beta_smbr_j = next_beta_smbr[next_hmm_state * num_sequences + s]; BaseFloat prob = prob_data[pdf_id * prob_stride + s], post = post_data[pdf_id * post_stride + s], variable_factor = transition_prob * next_beta_j * prob; - beta_smbr += (next_beta_smbr_j + post) - * next_beta_j * prob * transition_prob; + tot_beta_smbr += (next_beta_smbr_j + post) * variable_factor; tot_variable_factor += variable_factor; - double this_gamma_r = occupation_factor * next_beta_j - * transition_prob * (this_alpha_smbr_i + post - + next_beta_smbr_j - tot_smbr_(s)); + double this_gamma_r = occupation_factor * variable_factor * + (this_alpha_smbr_i + post + next_beta_smbr_j - tot_smbr_(s)); log_prob_deriv_data[pdf_id * deriv_stride + s] += this_gamma_r; } this_beta_dash[h * num_sequences + s] = tot_variable_factor / inv_arbitrary_scale; - if (tot_variable_factor > 0.0) - this_beta_smbr[h * num_sequences + s] = beta_smbr / tot_variable_factor; + this_beta_smbr[h * num_sequences + s] = + tot_beta_smbr / inv_arbitrary_scale; } } } @@ -471,9 +476,10 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { CuSubMatrix this_log_prob_deriv( nnet_output_deriv_transposed_, 0, num_pdfs, t_wrapped * num_sequences_, num_sequences_); - BaseFloat alpha_beta_product = VecVec(this_alpha_dash, this_beta_dash), + BaseFloat alpha_beta_product = VecVec(this_alpha_dash, + this_beta_dash), this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); - if (!ApproxEqual(alpha_beta_product, num_sequences_)) { + if (GetVerboseLevel() > 1 || !ApproxEqual(alpha_beta_product, num_sequences_)) { KALDI_WARN << "On time " << t << ", alpha-beta product " << alpha_beta_product << " != " << num_sequences_ << " alpha-dash-sum = " << this_alpha_dash.Sum() @@ -484,19 +490,38 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { } } + // alpha_smbr_vec is a vector of size 'num_hmm_states' * 'num_sequences_' + CuVector alpha_beta_smbr_vec(this_beta_smbr); + alpha_beta_smbr_vec.DivElements(this_beta_dash); + alpha_beta_smbr_vec.AddVec(1.0, this_beta_smbr, 1.0); + + CuVector alpha_beta_vec(this_alpha_dash); + alpha_beta_vec.MulElements(this_beta_dash); + + alpha_beta_smbr_vec.MulElements(alpha_beta_vec); + + BaseFloat alpha_beta_smbr_sum = alpha_beta_smbr_vec.Sum() + / alpha_beta_product * num_sequences_, + tot_smbr_sum = tot_smbr_.Sum(); + KALDI_ASSERT (alpha_beta_smbr_sum - alpha_beta_smbr_sum == 0.0); + if (GetVerboseLevel() > 1 || !ApproxEqual(tot_smbr_sum, alpha_beta_smbr_sum, 0.01)) { + KALDI_WARN << "On time " << t 
<< ", alpha-beta-smbr " + << alpha_beta_smbr_sum << " != " << tot_smbr_sum; + } + //BaseFloat acc = (VecVec(this_alpha_smbr, this_alpha_dash) // + VecVec(this_beta_dash, this_beta_smbr)) // / alpha_beta_product; // use higher tolerance, since we are using randomized pruning for the // log-prob derivatives. - if (!ApproxEqual(this_log_prob_deriv_sum, 0, 0.01)) { - KALDI_WARN << "On time " << t << ", log-prob-deriv sum " - << this_log_prob_deriv_sum << " != " << 0; - if (fabs(this_log_prob_deriv_sum - 0) > 2.0) { - KALDI_WARN << "Excessive error detected, will abandon this minibatch"; - ok_ = false; - } - } + ///if (!ApproxEqual(this_log_prob_deriv_sum, 0, 0.01)) { + /// KALDI_WARN << "On time " << t << ", log-prob-deriv sum " + /// << this_log_prob_deriv_sum << " != " << 0; + /// if (fabs(this_log_prob_deriv_sum - 0) > 2.0) { + /// KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + /// ok_ = false; + /// } + ///} } diff --git a/src/chain/chain-denominator-smbr.h b/src/chain/chain-denominator-smbr.h index c8b1855664c..0fd52cf178d 100644 --- a/src/chain/chain-denominator-smbr.h +++ b/src/chain/chain-denominator-smbr.h @@ -67,12 +67,14 @@ namespace chain { * Forward computation (version 1) - In the forward computation we're computing alpha(i, t) for 0 <= t <= T): + In the forward computation we're computing alpha(i, t) and alpha_r(i, t) + for 0 <= t <= T): - For the first frame, set alpha(0, i) = init(i), where init(i) is the initial-probabilitiy from state i. # in our framework these are obtained # by running the HMM for a while and getting an averaged occupation # probability, and using this as an initial-prob, since the boundaries of # chunks don't really correspond to utterance boundaries in general.] + Also set alpha_r(0, i) = 0. - For t = 1 ... T: for i = 0 ... I-1: alpha(t, i) = 0 @@ -158,7 +160,7 @@ namespace chain { Let leaky-hmm-prob be a constant defined by the user, with 0.1 being a typical value. It defines how much probability we give to the 'leaky' transitions. - - For frame 0, set alpha(0, i) = init(i). + - For frame 0, set alpha(0, i) = init(i), alpha_r(0, i) = 0 - For 0 <= t <= T, define tot-alpha(t) = \sum_i alpha(t, i). - For 0 <= t <= T, define alpha'(t, i) = alpha(t, i) + tot-alpha(t) * leaky-hmm-prob * init(i). @@ -168,7 +170,7 @@ namespace chain { alpha_r(t, i) = 0 for (j, p, n) in pred(i): # note: j is preceding-state. alpha(t, i) += alpha'(t-1, j) * p * x(t-1, n) / tot-alpha(t-1) - alpha_r(t, i) += (alpha_r(t-1, j) + (ref_pdf == pdf ? 1.0 : 0.0)) * alpha'(t-1, j) * x(t-1, n) / tot-alpha(t-1) * p + alpha_r(t, i) += (alpha_r(t-1, j) + (ref_pdf == pdf ? 1.0 : 0.0)) * alpha'(t-1, j) * p * x(t-1, n) / tot-alpha(t-1) alpha_r(t, i) /= alpha(t,i) - total-prob = \sum_i alpha'(T, i) @@ -253,7 +255,7 @@ class DenominatorSmbrComputation { void AlphaSmbrGeneralFrame(int32 t); // does the 'alpha-dash' computation for time t. this relates to // 'leaky hmm'. - void AlphaDash(int32 t); + void AlphaSmbrDash(int32 t); // done after all the alphas, this function computes and returns the total // smbr objective summed over all the sequences, and sets tot_prob_ (if we're @@ -267,7 +269,7 @@ class DenominatorSmbrComputation { // beta computation for 0 <= beta < num_time_steps_. void BetaSmbrGeneralFrame(int32 t); // compute the beta quantity from the beta-dash quantity (relates to leaky hmm). - void Beta(int32 t); + void BetaSmbr(int32 t); // some checking that we can do if debug mode is activated, or on frame zero. // Sets ok_ to false if a bad problem is detected. 
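// A standalone sketch of the per-transition derivative accumulated in
// BetaSmbrGeneralFrame() above; all arguments are scalars for one
// (transition, sequence) pair, and the function name is a stand-in.
inline double SmbrGamma(double alpha_dash_i,       // forward prob of source state
                        double beta_j,             // backward prob of destination state
                        double transition_prob,
                        double pseudo_like,        // exp(nnet output) for the arc's pdf
                        double alpha_smbr_i,       // expected accuracy up to the source
                        double beta_smbr_j,        // expected accuracy after the destination
                        double num_post,           // numerator posterior for the arc's pdf
                        double tot_smbr,           // per-sequence total expected accuracy
                        double inv_arbitrary_scale) {
  double occupation_factor = alpha_dash_i / inv_arbitrary_scale;
  double variable_factor = transition_prob * beta_j * pseudo_like;
  // occupation probability of the arc, times how much the expected accuracy of
  // paths through this arc differs from the sequence average: this is the
  // contribution to the derivative w.r.t. the arc's log pseudo-likelihood.
  return occupation_factor * variable_factor *
      (alpha_smbr_i + num_post + beta_smbr_j - tot_smbr);
}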
@@ -290,9 +292,10 @@ class DenominatorSmbrComputation { // num_sequences + sequence_index). CuMatrix exp_nnet_output_transposed_; - // the numberator posterior probabilities - // This is a matrix of size num_sequences x num_pdfs - CuMatrix num_posteriors_; + // the numerator posterior probabilities + // The row-index is the pdf-id; and the column index equals (frame_index * + // num_sequences + sequence_index). + CuMatrix numerator_posteriors_transposed_; // the derivs w.r.t. the nnet outputs (transposed) CuMatrix nnet_output_deriv_transposed_; diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index 2b27d4b9176..1e2e418991c 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -48,7 +48,7 @@ DenominatorComputation::DenominatorComputation( tot_log_prob_(num_sequences_, kUndefined), log_correction_term_(num_sequences_, kUndefined), ok_(true) { - KALDI_ASSERT(opts_.leaky_hmm_coefficient > 0.0 && + KALDI_ASSERT(opts_.leaky_hmm_coefficient >= 0.0 && opts_.leaky_hmm_coefficient < 1.0); // make sure the alpha sums and beta sums are zeroed. alpha_.ColRange(den_graph_.NumStates() * num_sequences_, @@ -401,7 +401,7 @@ void DenominatorComputation::BetaGeneralFrameDebug(int32 t) { BaseFloat alpha_beta_product = VecVec(this_alpha_dash, this_beta_dash), this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); - if (!ApproxEqual(alpha_beta_product, num_sequences_)) { + if (true || GetVerboseLevel() > 1 || !ApproxEqual(alpha_beta_product, num_sequences_)) { KALDI_WARN << "On time " << t << ", alpha-beta product " << alpha_beta_product << " != " << num_sequences_ << " alpha-dash-sum = " << this_alpha_dash.Sum() diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index c7e207707d4..8ed7bd54a98 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -169,7 +169,7 @@ static void _cuda_chain_hmm_forward(const Int32Pair *backward_transitions, // range. This won't affect the posteriors, as it's just a constant factor // for each frame, but when computing the total likelihood we'll need to // compensate for it later on. - BaseFloat arbitrary_scale = + BaseFloat arbitrary_scale = 1.0 / prev_alpha[num_hmm_states * num_sequences + s]; this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; } @@ -256,227 +256,6 @@ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, } -// one iteration of the forward computation in the chain HMM with -// SMBR objective. -// The grid y determines which HMM-state we handle. [put this in the grid because -// HMM-states don't all take the same amount of time in the backwards direction, and it's -// better for scheduling to have them at the outer level.] -// The block x and grid x determine which sequence (0 ... num_sequences - 1) we handle; -// note that num_sequences == the number of elements in the minibatch, and we -// insist they all have the same number of time steps. -// note: 'probs' is indexed by sequence-index + (pdf-index * prob_stride). -// note: 'num_post' is indexed by sequence-index + (pdf-index * post_stride). 
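// A small sketch of the transposed indexing convention described above for
// exp_nnet_output_transposed_ and numerator_posteriors_transposed_; 'data'
// and 'stride' are stand-ins for the underlying matrix storage.
inline double TransposedAt(const double *data, int stride,
                           int pdf_id, int frame, int sequence,
                           int num_sequences) {
  // row = pdf-id, column = frame * num_sequences + sequence; 'stride' is the
  // row stride of the matrix holding the transposed quantity.
  return data[pdf_id * stride + frame * num_sequences + sequence];
}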
-__global__ -static void _cuda_chain_smbr_hmm_forward( - const Int32Pair *backward_transitions, - const DenominatorGraphTransition *transitions, - int32_cuda num_sequences, - int32_cuda num_hmm_states, - const BaseFloat *probs, int32_cuda prob_stride, - const BaseFloat *num_post, int32_cuda post_stride, - const BaseFloat *prev_alpha, const BaseFloat *prev_alpha_smbr, - BaseFloat *this_alpha, BaseFloat *this_alpha_smbr) { - // 'backward_transitions', indexed by hmm-state, consists of [start, end] - // indexes into the 'transitions' array. This gives us the info for - // transitions *into* this state. 'probs' contains the exponentiated neural - // net outputs; it has dimension num-output-indexes by num_sequences and its - // stride is 'prob_stride'. 'prev_alpha' and 'this_alpha', which are - // extracted from a larger matrix, both have dimension num-history-states by - // num-sequences. 'prev_alpha_smbr' and 'this_alpha_smbr' are analogous - // for the partial SMBR values. - - // s is the index of the sequence within the minibatch, - // from 0 .. num-egs-in-this-minibatch - 1. - // h is the hmm-state index. - int32_cuda s = threadIdx.x + blockIdx.x * blockDim.x, - h = blockIdx.y; - if (s >= num_sequences) - return; - - // Let arbitrary_scale be the inverse of the sum of all alpha values on-- the - // previous frame this sum of all the alpha values is stored in the place that - // we'd store the previous alpha for state-index equal to num_hmm_states - // (i.e. one past the end). We multiply this into all the - // transition-probabilities from the previous frame to this frame, in both the - // forward and backward passes, in order to keep the alphas in a good numeric - // range. This won't affect the posteriors, as it's just a constant factor - // for each frame, but when computing the total likelihood we'll need to - // compensate for it later on. - BaseFloat arbitrary_scale = - 1.0 / prev_alpha[num_hmm_states * num_sequences + s]; - - double this_tot_alpha = 0.0, this_tot_alpha_smbr = 0.0; - const DenominatorGraphTransition - *trans_iter = transitions + backward_transitions[h].first, - *trans_end = transitions + backward_transitions[h].second; - // Note: regarding this loop unrolling, I tried the automatic unrolling using - // #pragma unroll 2 (after modifying the loop to have an integer index), but I - // did not see any performance improvement, it was slightly slower. So the - // compiler must be doing something different than what I'm doing here. - const int loop_unroll = 2; // don't change this without changing the code - // below. 
- for (; trans_iter + loop_unroll <= trans_end; trans_iter += loop_unroll) { - BaseFloat transition_prob0 = trans_iter[0].transition_prob; - int32_cuda pdf_id0 = trans_iter[0].pdf_id, - prev_hmm_state0 = trans_iter[0].hmm_state; - BaseFloat transition_prob1 = trans_iter[1].transition_prob; - int32_cuda pdf_id1 = trans_iter[1].pdf_id, - prev_hmm_state1 = trans_iter[1].hmm_state; - BaseFloat pseudo_loglike0 = probs[pdf_id0 * prob_stride + s], - num_post0 = num_post[pdf_id0 * post_stride + s], - this_prev_alpha0 = prev_alpha[prev_hmm_state0 * num_sequences + s], - this_prev_alpha_smbr0 = - prev_alpha_smbr[prev_hmm_state0 * num_sequences + s], - pseudo_loglike1 = probs[pdf_id1 * prob_stride + s], - num_post1 = num_post[pdf_id1 * post_stride + s], - this_prev_alpha1 = prev_alpha[prev_hmm_state1 * num_sequences + s], - this_prev_alpha_smbr1 = prev_alpha[prev_hmm_state1 * num_sequences + s]; - - this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0 + - this_prev_alpha1 * transition_prob1 * pseudo_loglike1; - this_tot_alpha_smbr += - (this_prev_alpha_smbr0 + num_post0) * this_prev_alpha0 - * transition_prob0 * pseudo_loglike0 - + (this_prev_alpha_smbr1 + num_post1 * this_prev_alpha1 - * transition_prob1 * pseudo_loglike1; - } - if (trans_iter != trans_end) { - // mop up the odd transition. - BaseFloat transition_prob0 = trans_iter[0].transition_prob; - int32_cuda pdf_id0 = trans_iter[0].pdf_id, - prev_hmm_state0 = trans_iter[0].hmm_state; - BaseFloat pseudo_loglike0 = probs[pdf_id0 * prob_stride + s], - this_prev_alpha0 = prev_alpha[prev_hmm_state0 * num_sequences + s], - this_prev_alpha_smbr0 = - prev_alpha_smbr[prev_hmm_state0 * num_sequences + s]; - this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0; - this_tot_alpha_smbr += - (this_prev_alpha_smbr0 + num_post0) * this_prev_alpha0 - * transition_prob0 * pseudo_loglike0; - } - - this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; - - if (this_tot_alpha > 0.0) { - this_alpha_smbr[h * num_sequences + s] = - this_tot_alpha_smbr / this_tot_alpha; - } else { - this_alpha_smbr[h * num_sequences + s] = 0.0; - } -} - - -__global__ -static void _cuda_chain_smbr_hmm_backward( - const Int32Pair *forward_transitions, - const DenominatorGraphTransition *transitions, - int32_cuda num_sequences, int32_cuda num_hmm_states, - const BaseFloat *probs, int32_cuda prob_stride, - const BaseFloat *num_post, int32_cuda post_stride, - const BaseFloat *tot_smbr, - const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, - const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, - BaseFloat *this_beta, BaseFloat *this_beta_smbr, - BaseFloat *log_prob_deriv, int32_cuda log_prob_deriv_stride) { - // 'forward_transitions', indexed by hmm-state, consists of [start, end] - // indexes into the 'transition_info' array. This is about the transitions - // *out of* this state. 'probs' contains the exponentiated neural net - // outputs; it has dimension num-output-indexes by num_sequences, and contains - // just the observation probabilities for this time index. Its stride is - // prob_stride. - // 'this_alpha', 'next_beta' and 'this_beta' all have dimension - // num-history-states by num-sequences. - // 'this_alpha_smbr', 'next_beta_smbr', and 'this_beta_smbr' are - // analogous quantities storing values for SMBR objective. - // The beta probs are normalized in such a way (by multiplying by 1/(total-data-prob)) - // that to get occupation counts we don't need to multiply by 1/total-data-prob. 
- // deriv_scale is a factor (e.g. -1.0 or -0.99) that we multiply these derivs by - // while accumulating them. - - // s is the index of the sequence within the minibatch, - // from 0 .. num-egs-in-this-minibatch - 1. - // h is the hmm-state index. - int32_cuda s = threadIdx.x + blockIdx.x * blockDim.x, - h = blockIdx.y; - if (s >= num_sequences) - return; - - // See where arbitrary_scale is defined in the forward computation above, for - // more explanation of inv_arbitrary_scale. - BaseFloat this_alpha_prob = this_alpha[h * num_sequences + s], - this_alpha_smbr_i = this_alpha_smbr[h * num_sequences + s], - inv_arbitrary_scale = - this_alpha[num_hmm_states * num_sequences + s]; - double tot_variable_factor = 0.0, beta_smbr = 0.0; - - BaseFloat occupation_factor = this_alpha_prob / inv_arbitrary_scale; - const DenominatorGraphTransition - *trans_iter = transitions + forward_transitions[h].first, - *trans_end = transitions + forward_transitions[h].second; - const int loop_unroll = 2; // don't change this without changing the code - // below. - for (; trans_iter + loop_unroll <= trans_end; trans_iter += loop_unroll) { - BaseFloat transition_prob0 = trans_iter[0].transition_prob; - int32_cuda pdf_id0 = trans_iter[0].pdf_id, - next_hmm_state0 = trans_iter[0].hmm_state; - BaseFloat transition_prob1 = trans_iter[1].transition_prob; - int32_cuda pdf_id1 = trans_iter[1].pdf_id, - next_hmm_state1 = trans_iter[1].hmm_state; - BaseFloat next_beta_j0 = next_beta[next_hmm_state0 * num_sequences + s], - next_beta_smbr_j0 = next_beta_smbr[next_hmm_state0 * num_sequences + s], - next_beta_j1 = next_beta[next_hmm_state1 * num_sequences + s], - next_beta_smbr_j1 = next_beta_smbr[next_hmm_state1 * num_sequences + s], - prob0 = probs[pdf_id0 * prob_stride + s], - prob1 = probs[pdf_id1 * prob_stride + s], - num_post0 = num_post[pdf_id0 * post_stride + s], - num_post1 = num_post[pdf_id1 * post_stride + s]; - - BaseFloat variable_factor0 = transition_prob0 * next_beta_j0 * prob0, - variable_factor1 = transition_prob1 * next_beta_j1 * prob1; - beta_smbr += (next_beta_smbr_j0 + num_post0) * next_beta_j0 - * prob0 * transition_prob0 - + (next_beta_smbr_j1 + num_post1) * next_beta_j1 - * prob1 * transition_prob1; - tot_variable_factor += variable_factor0 + variable_factor1; - BaseFloat this_gamma_r0 = occupation_factor * next_beta_j0 - * transition_prob0 * (this_alpha_smbr_i + num_post0 - + next_beta_smbr_j0 - tot_smbr[s]); - atomic_add_thresholded(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), - this_gamma_r0); - BaseFloat this_gamma_r1 = occupation_factor * next_beta_j1 - * transition_prob1 * (this_alpha_smbr_i + num_post1 - + next_beta_smbr_j1 - tot_smbr[s]); - atomic_add_thresholded(log_prob_deriv + (pdf_id1 * log_prob_deriv_stride + s), - this_gamma_r1); - } - if (trans_iter != trans_end) { - // mop up the odd transition. 
- BaseFloat transition_prob0 = trans_iter[0].transition_prob; - int32_cuda pdf_id0 = trans_iter[0].pdf_id, - next_hmm_state0 = trans_iter[0].hmm_state; - BaseFloat next_beta_j0 = next_beta[next_hmm_state0 * num_sequences + s], - next_beta_smbr_j0 = next_beta_smbr[next_hmm_state0 * num_sequences + s], - prob0 = probs[pdf_id0 * prob_stride + s], - num_post0 = num_post[pdf_id0 * post_stride + s]; - BaseFloat variable_factor0 = transition_prob0 * next_beta_j0 * prob0; - beta_smbr += (next_beta_smbr_j0 + num_post0) * next_beta_j0 - * prob0 * transition_prob0; - tot_variable_factor += variable_factor0; - BaseFloat this_gamma_r0 = occupation_factor * next_beta_j0 - * transition_prob0 * (this_alpha_smbr_i + num_post0 - + next_beta_smbr_j0 - tot_smbr[s]); - atomic_add_thresholded(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), - this_gamma_r0); - } - BaseFloat beta = tot_variable_factor / inv_arbitrary_scale; - this_beta[h * num_sequences + s] = beta; - - if (beta > 0.0) - this_beta_smbr[h * num_sequences + s] = beta_smbr / beta; - else - this_beta_smbr[h * num_sequences + s] = 0.0; -} - void cuda_chain_hmm_forward(dim3 Gr, dim3 Bl, const Int32Pair *backward_transitions, @@ -510,43 +289,3 @@ void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl, log_prob_deriv_stride); } -// Chain forward with SMBR objective -void cuda_chain_smbr_hmm_forward( - dim3 Gr, dim3 Bl, - const Int32Pair *backward_transitions, - const DenominatorGraphTransition *transitions, - int32_cuda num_sequences, - int32_cuda num_hmm_states, - const BaseFloat *probs, int32_cuda prob_stride, - const BaseFloat *num_post, int32_cuda post_stride, - const BaseFloat *prev_alpha, const BaseFloat *prev_alpha_smbr, - BaseFloat *this_alpha, BaseFloat *this_alpha_smbr) { - _cuda_chain_smbr_hmm_forward<<>>( - backward_transitions, transitions, - num_sequences, num_hmm_states, - probs, prob_stride, num_post, post_stride, - prev_alpha, prev_alpha_smbr, this_alpha, this_alpha_smbr); -} - -void cuda_chain_smbr_hmm_backward( - dim3 Gr, dim3 Bl, - const Int32Pair *forward_transitions, - const DenominatorGraphTransition *transitions, - int32_cuda num_sequences, - int32_cuda num_hmm_states, - const BaseFloat *probs, int32_cuda prob_stride, - const BaseFloat *num_post, int32_cuda post_stride, - const BaseFloat *tot_smbr, - const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, - const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, - BaseFloat *this_beta, BaseFloat *this_beta_smbr, - BaseFloat *log_prob_deriv, - int32_cuda log_prob_deriv_stride) { - _cuda_chain_smbr_hmm_backward<<>>( - forward_transitions, transitions, - num_sequences, num_hmm_states, - probs, prob_stride, num_post, post_stride, tot_smbr, - this_alpha, this_alpha_smbr, next_beta, next_beta_smbr, - this_beta, this_beta_smbr, log_prob_deriv, - log_prob_deriv_stride); -} diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index 0c8d9f68aad..da56ab3152d 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -26,7 +26,7 @@ #include "chain/chain-den-graph.h" #include "chain/chain-denominator.h" #include "hmm/hmm-utils.h" - +#include namespace kaldi { @@ -260,15 +260,137 @@ void ChainTrainingTest(const DenominatorGraph &den_graph, BaseFloat objf, l2_term, weight; - if (RandInt(0, 1) == 1) { + ComputeChainObjfAndDeriv(opts, den_graph, supervision, + nnet_output, &objf, &l2_term, &weight, + &nnet_output_deriv); + + { + // make sure each row of nnet_output_deriv sums to one (shift invariance of + // the nnet 
output). + CuVector nnet_output_deriv_row_sums(nnet_output_deriv.NumRows()); + nnet_output_deriv_row_sums.AddColSumMat(1.0, nnet_output_deriv, 0.0); + KALDI_ASSERT(nnet_output_deriv_row_sums.Norm(2.0) < 0.1); + } + + KALDI_LOG << "Chain objf per frame is " << (objf / weight) + << " over " << weight << " frames (weighted)"; + + { // a check + BaseFloat output_deriv_sum = nnet_output_deriv.Sum(); + KALDI_LOG << "Sum of nnet-output-deriv is " << output_deriv_sum + << " vs. expected 0."; + KALDI_ASSERT(output_deriv_sum < 0.2); + } + + KALDI_ASSERT(objf <= 0.0); + + int32 num_tries = 5; + BaseFloat epsilon = 1.0e-04; + Vector predicted_objf_changes(num_tries), + observed_objf_changes(num_tries); + for (int32 p = 0; p < num_tries; p++) { + CuMatrix nnet_delta_output(nnet_output.NumRows(), + nnet_output.NumCols()); + nnet_delta_output.SetRandn(); + nnet_delta_output.Scale(epsilon); + predicted_objf_changes(p) = TraceMatMat(nnet_output_deriv, + nnet_delta_output, kTrans); + CuMatrix nnet_output_perturbed(nnet_delta_output); + nnet_output_perturbed.AddMat(1.0, nnet_output); + + BaseFloat objf_modified, l2_term_modified, weight_modified; + + ComputeChainObjfAndDeriv(opts, den_graph, supervision, + nnet_output_perturbed, + &objf_modified, &l2_term_modified, + &weight_modified, + NULL); + + observed_objf_changes(p) = objf_modified - objf; + } + KALDI_LOG << "Predicted objf changes are " << predicted_objf_changes; + KALDI_LOG << "Observed objf changes are " << observed_objf_changes; + { + Vector error(predicted_objf_changes); + error.AddVec(-1.0, observed_objf_changes); + KALDI_LOG << "num-sequences = " << num_sequences << ", frames-per-sequence = " + << frames_per_sequence << ", relative accuracy is " + << (error.Norm(2.0) / predicted_objf_changes.Norm(2.0)); + } + + { + // we get inaccuracy for long segments, I think because there is a bias when we + // add random noise for it to increase the likelihood (for winner-take-all reasons) + // and for long utterances this bias adds up over the frames and tends to + // outweigh the random component that the gradient predicts (which will tend to + // cancel). Try to correct for this... + BaseFloat correction = (predicted_objf_changes.Sum() - observed_objf_changes.Sum()) / + predicted_objf_changes.Dim(); + observed_objf_changes.Add(correction); + KALDI_LOG << "Correcting observed objf changes for statistical effects, to " + << observed_objf_changes; + if (frames_per_sequence > 2 && + predicted_objf_changes.Norm(2.0) > 0.1 * epsilon) { + // if we only have the initial and final frames, due to the scaling-down + // of pdfs not in the numerator sequence the derivative might be zero, + // which would cause problems doing the comparison. + // note, epsilon = 1.0e-04. + KALDI_ASSERT(predicted_objf_changes.ApproxEqual(observed_objf_changes, 0.25)); + } + } +} + +void PrintMatrix(const CuMatrixBase &mat) { + std::cerr << " [ "; + for (int32 i = 0; i < mat.NumRows(); i++) { + for (int32 j = 0; j < mat.NumCols(); j++) { + std::cerr << mat(i, j) << " "; + } + std::cerr << "\n"; + } + std::cerr << " ] "; +} + + +void ChainSmbrTrainingTest(const DenominatorGraph &den_graph, + const Supervision &supervision) { + int32 num_sequences = supervision.num_sequences, + frames_per_sequence = supervision.frames_per_sequence; + if (frames_per_sequence == 1) // this will break some code. 
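// A standalone sketch of the finite-difference check performed in the test
// above: the predicted objective change trace(deriv^T * delta) should match
// the observed change objf(output + delta) - objf(output). 'Objf' and the
// function name are hypothetical stand-ins for the objective computed by
// ComputeChainObjfAndDeriv().
#include <functional>
#include <vector>

inline double DerivCheckError(const std::vector<double> &output,
                              const std::vector<double> &deriv,
                              const std::vector<double> &delta,  // small random perturbation
                              const std::function<double(const std::vector<double>&)> &Objf) {
  std::vector<double> perturbed(output);
  double predicted = 0.0;
  for (size_t i = 0; i < output.size(); i++) {
    perturbed[i] += delta[i];
    predicted += deriv[i] * delta[i];  // TraceMatMat(deriv, delta, kTrans) in the test
  }
  double observed = Objf(perturbed) - Objf(output);
  // For a small enough perturbation these agree up to second-order terms; the
  // test above compares five such tries with a 0.25 relative tolerance.
  return observed - predicted;
}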
+ return; + + CuMatrix nnet_output(num_sequences * frames_per_sequence, + den_graph.NumPdfs()); + + bool zero_output = (RandInt(0, 3) == 0); + if (!zero_output) + nnet_output.SetRandn(); + + ChainTrainingOptions opts; + opts.leaky_hmm_coefficient = 0.0; + //if (RandInt(0, 1) == 1) + // opts.leaky_hmm_coefficient = 0.2; + + { + KALDI_LOG << "LF-MMI training"; + BaseFloat objf, l2_term, weight; + CuMatrix nnet_output_deriv(nnet_output.NumRows(), + nnet_output.NumCols(), + kUndefined); ComputeChainObjfAndDeriv(opts, den_graph, supervision, nnet_output, &objf, &l2_term, &weight, &nnet_output_deriv); - } else { - ComputeChainSmbrObjfAndDeriv(opts, den_graph, supervision, - nnet_output, &objf, &l2_term, &weight, - &nnet_output_deriv); } + + CuMatrix nnet_output_deriv(nnet_output.NumRows(), + nnet_output.NumCols(), + kUndefined); + KALDI_LOG << "LF-SMBR training"; + opts.use_smbr_objective = true; + BaseFloat objf, l2_term, weight; + ComputeChainSmbrObjfAndDeriv(opts, den_graph, supervision, + nnet_output, &objf, &l2_term, &weight, + &nnet_output_deriv); { // make sure each row of nnet_output_deriv sums to one (shift invariance of @@ -288,8 +410,6 @@ void ChainTrainingTest(const DenominatorGraph &den_graph, KALDI_ASSERT(output_deriv_sum < 0.2); } - KALDI_ASSERT(objf <= 0.0); - int32 num_tries = 5; BaseFloat epsilon = 1.0e-04; Vector predicted_objf_changes(num_tries), @@ -306,7 +426,7 @@ void ChainTrainingTest(const DenominatorGraph &den_graph, BaseFloat objf_modified, l2_term_modified, weight_modified; - ComputeChainObjfAndDeriv(opts, den_graph, supervision, + ComputeChainSmbrObjfAndDeriv(opts, den_graph, supervision, nnet_output_perturbed, &objf_modified, &l2_term_modified, &weight_modified, @@ -541,6 +661,75 @@ void ChainSupervisionTest() { delete trans_model; } +void ChainSupervisionSimpleTest() { + ContextDependency *ctx_dep; + TransitionModel *trans_model = GenRandTransitionModel(&ctx_dep); + const std::vector &phones = trans_model->GetPhones(); + + int32 subsample_factor = 3; + + int32 phone_sequence_length = 2; + std::vector > phones_durations(phone_sequence_length); + + CompactLattice clat; + int32 cur_state = clat.AddState(); + clat.SetStart(cur_state); + + for (int32 i = 0; i < phone_sequence_length; i++) { + int32 phone = phones[RandInt(0, phones.size() - 1)]; + int32 min_length = trans_model->GetTopo().MinLength(phone), + headroom = 5, + duration = RandInt(subsample_factor * min_length, + subsample_factor * min_length + headroom); + phones_durations[i].first = phone; + phones_durations[i].second = duration; + int32 next_state = clat.AddState(); + std::vector ones(duration, 1); + clat.AddArc(cur_state, + CompactLatticeArc(phone, phone, + CompactLatticeWeight(LatticeWeight::One(), + ones), next_state)); + cur_state = next_state; + } + clat.SetFinal(cur_state, CompactLatticeWeight::One()); + ProtoSupervision proto_sup1, proto_sup2; + SupervisionOptions opts; + opts.frame_subsampling_factor = subsample_factor; + bool ans1 = AlignmentToProtoSupervision(opts, phones_durations, &proto_sup1), + ans2 = PhoneLatticeToProtoSupervision(opts, clat, &proto_sup2); + KALDI_ASSERT(ans1 && ans2); + KALDI_ASSERT(proto_sup1 == proto_sup2); + + Supervision supervision; + if (!ProtoSupervisionToSupervision(*ctx_dep, *trans_model, + proto_sup1, &supervision)) { + // we shouldn't fail because we multiplied by + // 'subsample_factor' when creating the duration. 
+ KALDI_ERR << "Failed creating supervision."; + } + supervision.Check(*trans_model); + TestSupervisionIo(supervision); + TestSupervisionSplitting(*ctx_dep, *trans_model, supervision); + //TestSupervisionAppend(*trans_model, supervision); + + { + fst::StdVectorFst den_fst; + ComputeExampleDenFst(*ctx_dep, *trans_model, &den_fst); + DenominatorGraph den_graph(den_fst, trans_model->NumPdfs()); + ChainDenominatorTest(den_graph); + fst::StdVectorFst normalization_fst; + den_graph.GetNormalizationFst(den_fst, &normalization_fst); + // add the weight to the numerator FST so we can assert objf <= 0. + bool ans = AddWeightToSupervisionFst(normalization_fst, &supervision); + KALDI_ASSERT(ans); + // TODO: still have to test for appended sequences. + ChainSmbrTrainingTest(den_graph, supervision); + } + + delete ctx_dep; + delete trans_model; +} + void AddArc(int32 from, int32 to, fst::StdVectorFst *fst) { fst->AddArc(from, fst::StdArc(0, 0, fst::TropicalWeight::One(), to)); @@ -612,21 +801,22 @@ void TestRanges() { int main() { using namespace kaldi; - SetVerboseLevel(1); + SetVerboseLevel(2); int32 loop = 0; #if HAVE_CUDA == 1 for (loop = 0; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); - if (loop == 0) + if (false && loop == 0) CuDevice::Instantiate().SelectGpuId("no"); else CuDevice::Instantiate().SelectGpuId("yes"); #endif for (int32 i = 0; i < 3; i++) { - kaldi::chain::ChainSupervisionTest(); - kaldi::chain::BreadthFirstTest(); + //kaldi::chain::ChainSupervisionTest(); + kaldi::chain::ChainSupervisionSimpleTest(); + //kaldi::chain::BreadthFirstTest(); } - kaldi::chain::TestRanges(); + //kaldi::chain::TestRanges(); #if HAVE_CUDA == 1 } CuDevice::Instantiate().PrintProfile(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 0a93bfd0f31..35edaa7373b 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -3156,7 +3156,6 @@ std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat); template std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat); - // Instantiate classes CuMatrix and CuMatrixBase for float and double. template class CuMatrix; template class CuMatrix; diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index e8823793cc3..b9f690e923c 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -582,6 +582,11 @@ class CuMatrixBase { void SetRandUniform(); void Write(std::ostream &os, bool binary) const; + inline std::string ToStr() const { + std::ostringstream oss; + oss << *this; + return oss.str(); + } // This function, adds a list of MatrixElements (scaled by alpha) to corresponding locations to // (*this). 
@@ -835,6 +840,8 @@ bool SameDimAndStride(const CuMatrixBase &M, const CuMatrixBase &N) template std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat); +template +std::string ToStr(const CuMatrixBase &mat); template template diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index a72314b50f0..0dd8d906c53 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -282,6 +282,11 @@ class CuVector: public CuVectorBase { /// I/O void Read(std::istream &is, bool binary); void Write(std::ostream &is, bool binary) const; + inline std::string ToStr() const { + std::ostringstream oss; + oss << *this; + return oss.str(); + } void Swap(Vector *vec); diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc index 50c23a7be63..b3e98d579f4 100644 --- a/src/matrix/kaldi-matrix.cc +++ b/src/matrix/kaldi-matrix.cc @@ -1267,6 +1267,23 @@ void MatrixBase::Write(std::ostream &os, bool binary) const { } } +template +std::string MatrixBase::ToStr() const { + std::ostringstream oss; + if (num_cols_ == 0) { + oss << " [ ]\n"; + } else { + oss << " ["; + for (MatrixIndexT i = 0; i < num_rows_; i++) { + oss << "\n "; + for (MatrixIndexT j = 0; j < num_cols_; j++) + oss << (*this)(i, j) << " "; + } + oss << "]\n"; + } + return oss.str(); +} + template void MatrixBase::Read(std::istream & is, bool binary, bool add) { diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index 25b999fe062..00ef3ad0a67 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -684,6 +684,8 @@ class MatrixBase { /// write to stream. void Write(std::ostream & out, bool binary) const; + std::string ToStr() const; + // Below is internal methods for Svd, user does not have to know about this. #if !defined(HAVE_ATLAS) && !defined(USE_KALDI_SVD) // protected: From 06826181031448de73b620dd40e25ebed6853f16 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sat, 24 Jun 2017 19:09:43 -0400 Subject: [PATCH 018/174] smbr without leaky --- src/chain/chain-denominator-smbr.cc | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc index c83d42daa0e..0e830a1d5d7 100644 --- a/src/chain/chain-denominator-smbr.cc +++ b/src/chain/chain-denominator-smbr.cc @@ -192,8 +192,13 @@ void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); KALDI_ASSERT(this_tot_alpha_smbr - this_tot_alpha_smbr == 0); this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; - this_alpha_smbr[h * num_sequences + s] = - this_tot_alpha_smbr * arbitrary_scale; + //this_alpha_smbr[h * num_sequences + s] = + // this_tot_alpha_smbr * arbitrary_scale; + if (this_tot_alpha > 0.0) + this_alpha_smbr[h * num_sequences + s] = + this_tot_alpha_smbr / this_tot_alpha; + else + this_alpha_smbr[h * num_sequences + s] = 0.0; } } } @@ -225,7 +230,7 @@ void DenominatorSmbrComputation::AlphaSmbrDash(int32 t) { den_graph_.InitialProbs(), alpha_sum_vec); // it's now alpha-dash. - alpha_smbr_.Row(t).DivElements(alpha_.Row(t)); + //alpha_smbr_.Row(t).DivElements(alpha_.Row(t)); } // compute beta from beta-dash. @@ -249,7 +254,7 @@ void DenominatorSmbrComputation::BetaSmbr(int32 t) { // will contain the actual beta (i.e. the counterpart of alpha), // not the beta-dash. 
beta_dash_mat.AddVecToRows(1.0, beta_dash_sum_vec); - beta_smbr_.Row(t % 2).DivElements(beta_.Row(t % 2)); + //beta_smbr_.Row(t % 2).DivElements(beta_.Row(t % 2)); } BaseFloat DenominatorSmbrComputation::ForwardSmbr() { @@ -457,8 +462,13 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { } this_beta_dash[h * num_sequences + s] = tot_variable_factor / inv_arbitrary_scale; - this_beta_smbr[h * num_sequences + s] = - tot_beta_smbr / inv_arbitrary_scale; + //this_beta_smbr[h * num_sequences + s] = + // tot_beta_smbr / inv_arbitrary_scale; + if (tot_variable_factor > 0.0) + this_beta_smbr[h * num_sequences + s] = + tot_beta_smbr / tot_variable_factor; + else + this_beta_smbr[h * num_sequences + s] = 0.0; } } } @@ -492,8 +502,8 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { // alpha_smbr_vec is a vector of size 'num_hmm_states' * 'num_sequences_' CuVector alpha_beta_smbr_vec(this_beta_smbr); - alpha_beta_smbr_vec.DivElements(this_beta_dash); - alpha_beta_smbr_vec.AddVec(1.0, this_beta_smbr, 1.0); + //alpha_beta_smbr_vec.DivElements(this_beta_dash); + alpha_beta_smbr_vec.AddVec(1.0, this_alpha_smbr, 1.0); CuVector alpha_beta_vec(this_alpha_dash); alpha_beta_vec.MulElements(this_beta_dash); From 62da39acf35bf5577c1f35147b44b33a81ab6970 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 27 Jun 2017 17:42:31 -0400 Subject: [PATCH 019/174] chain-smbr: Fix bugs in chain smbr --- .../nnet3/train/chain_objf/acoustic_model.py | 2 +- src/chain/chain-denominator-smbr.cc | 106 +++++++++++------- src/chain/chain-denominator-smbr.h | 5 +- src/chain/chain-denominator.cc | 4 +- src/chain/chain-supervision-test.cc | 8 +- 5 files changed, 72 insertions(+), 53 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 72a8cdd0f1c..302e99d0d73 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -215,7 +215,7 @@ def train_new_models(dir, iter, srand, num_jobs, buf_size=shuffle_buffer_size, num_chunk_per_mb=num_chunk_per_minibatch_str, smbr_opt="--use-smbr-objective" - if use_smbr_objective is not None else ""), + if use_smbr_objective else ""), require_zero_status=True) threads.append(thread) diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc index 0e830a1d5d7..607be64decf 100644 --- a/src/chain/chain-denominator-smbr.cc +++ b/src/chain/chain-denominator-smbr.cc @@ -45,12 +45,14 @@ DenominatorSmbrComputation::DenominatorSmbrComputation( den_graph_.NumStates() * num_sequences_ + num_sequences_, kUndefined), alpha_smbr_(frames_per_sequence_ + 1, - den_graph_.NumStates() * num_sequences_ + num_sequences_), + den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), beta_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_, kUndefined), - beta_smbr_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_), + beta_smbr_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), tot_prob_(num_sequences_, kUndefined), - tot_smbr_(num_sequences_), + tot_smbr_(num_sequences_, kUndefined), ok_(true) { KALDI_ASSERT(opts_.leaky_hmm_coefficient >= 0.0 && opts_.leaky_hmm_coefficient < 1.0); @@ -69,7 +71,7 @@ DenominatorSmbrComputation::DenominatorSmbrComputation( } -void DenominatorSmbrComputation::AlphaFirstFrame() { +void DenominatorSmbrComputation::AlphaSmbrFirstFrame() { // dim == num_hmm_states_ * 
num_sequences_. BaseFloat *first_frame_alpha = alpha_.RowData(0); // create a 'fake matrix' - view this row as a matrix. @@ -82,19 +84,14 @@ void DenominatorSmbrComputation::AlphaFirstFrame() { // CopyColsFromVec function in class CuMatrix. alpha_mat.SetZero(); alpha_mat.AddVecToCols(1.0, den_graph_.InitialProbs(), 0.0); -} - -void DenominatorSmbrComputation::AlphaSmbrFirstFrame() { - // dim == num_hmm_states_ * num_sequences_. BaseFloat *first_frame_alpha_smbr = alpha_smbr_.RowData(0); // create a 'fake matrix' - view this row as a matrix. // initializer takes [pointer, num-rows, num-cols, stride]. - CuSubMatrix alpha_smbr_mat(first_frame_alpha_smbr, - den_graph_.NumStates(), - num_sequences_, - num_sequences_); - alpha_smbr_mat.SetZero(); + CuSubVector alpha_smbr_vec(first_frame_alpha_smbr, + den_graph_.NumStates() + * num_sequences_); + alpha_smbr_vec.SetZero(); } @@ -187,13 +184,11 @@ void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { // a good numeric range. This won't affect the posteriors, but when // computing the total likelihood we'll need to compensate for it later // on. - BaseFloat arbitrary_scale = + BaseFloat arbitrary_scale = 1.0 / prev_alpha_dash[num_hmm_states * num_sequences + s]; KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); KALDI_ASSERT(this_tot_alpha_smbr - this_tot_alpha_smbr == 0); this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; - //this_alpha_smbr[h * num_sequences + s] = - // this_tot_alpha_smbr * arbitrary_scale; if (this_tot_alpha > 0.0) this_alpha_smbr[h * num_sequences + s] = this_tot_alpha_smbr / this_tot_alpha; @@ -206,6 +201,7 @@ void DenominatorSmbrComputation::AlphaSmbrGeneralFrame(int32 t) { void DenominatorSmbrComputation::AlphaSmbrDash(int32 t) { BaseFloat *this_alpha = alpha_.RowData(t); + BaseFloat *this_alpha_smbr = alpha_smbr_.RowData(t); // create a 'fake matrix' for the regular alphas- view this row as a matrix. // initializer takes [pointer, num-rows, num-cols, stride]. @@ -214,6 +210,12 @@ void DenominatorSmbrComputation::AlphaSmbrDash(int32 t) { num_sequences_, num_sequences_); + CuSubMatrix alpha_smbr_mat(this_alpha_smbr, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + alpha_smbr_mat.MulElements(alpha_mat); + // Compute the sum of alpha over all states i for the current time. // This is done for each sequence and stored in the last 'num_sequences_' // columns. @@ -221,21 +223,31 @@ void DenominatorSmbrComputation::AlphaSmbrDash(int32 t) { den_graph_.NumStates() * num_sequences_, num_sequences_); alpha_sum_vec.AddRowSumMat(1.0, alpha_mat, 0.0); + + CuSubVector alpha_smbr_sum_vec( + this_alpha_smbr + den_graph_.NumStates() * num_sequences_, + num_sequences_); + alpha_smbr_sum_vec.AddRowSumMat(1.0, alpha_smbr_mat, 0.0); BaseFloat alpha_sum = alpha_sum_vec.Sum(); KALDI_VLOG(2) << "alpha-sum for time " << t << " is " << alpha_sum; KALDI_ASSERT(alpha_sum_vec.Min() > 0); + alpha_smbr_mat.AddVecVec(opts_.leaky_hmm_coefficient, + den_graph_.InitialProbs(), + alpha_smbr_sum_vec); alpha_mat.AddVecVec(opts_.leaky_hmm_coefficient, den_graph_.InitialProbs(), alpha_sum_vec); // it's now alpha-dash. - //alpha_smbr_.Row(t).DivElements(alpha_.Row(t)); + + alpha_smbr_mat.DivElements(alpha_mat); } // compute beta from beta-dash. 
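// A standalone sketch of the leaky-HMM smoothing done in AlphaSmbrDash()
// above, written for a single sequence; vector indices are HMM states,
// 'leaky' corresponds to opts_.leaky_hmm_coefficient, and the function name
// is a stand-in.
#include <vector>

inline void LeakySmbrSmooth(std::vector<double> *alpha,       // in: alpha, out: alpha-dash
                            std::vector<double> *alpha_smbr,  // per-state expected accuracies
                            const std::vector<double> &init_probs,
                            double leaky) {
  int num_states = static_cast<int>(alpha->size());
  // Convert expected accuracies to "accuracy mass" so they can be mixed the
  // same way as probability mass, and compute the per-sequence sums.
  double alpha_sum = 0.0, smbr_mass_sum = 0.0;
  for (int i = 0; i < num_states; i++) {
    (*alpha_smbr)[i] *= (*alpha)[i];
    alpha_sum += (*alpha)[i];
    smbr_mass_sum += (*alpha_smbr)[i];
  }
  for (int i = 0; i < num_states; i++) {
    // Leaky transitions redistribute a fraction of the total mass according
    // to the initial probabilities, for both quantities...
    (*alpha)[i] += leaky * init_probs[i] * alpha_sum;
    (*alpha_smbr)[i] += leaky * init_probs[i] * smbr_mass_sum;
    // ...then convert back from mass to a per-state expected accuracy.
    (*alpha_smbr)[i] /= (*alpha)[i];
  }
}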
void DenominatorSmbrComputation::BetaSmbr(int32 t) { BaseFloat *this_beta_dash = beta_.RowData(t % 2); + BaseFloat *this_beta_smbr_dash = beta_smbr_.RowData(t % 2); // create a 'fake matrix' for the regular beta-dash (which is // the counterpart of alpha-dash)- view this row as a matrix. // initializer takes [pointer, num-rows, num-cols, stride]. @@ -243,6 +255,13 @@ void DenominatorSmbrComputation::BetaSmbr(int32 t) { den_graph_.NumStates(), num_sequences_, num_sequences_); + + CuSubMatrix beta_smbr_dash_mat(this_beta_smbr_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + beta_smbr_dash_mat.MulElements(beta_dash_mat); + // making the t index implicit, the beta-dash-sum for each sequence is the sum // over all states i of beta_i * opts_.leaky_hmm_coefficient * initial_prob_i. CuSubVector beta_dash_sum_vec( @@ -250,15 +269,23 @@ void DenominatorSmbrComputation::BetaSmbr(int32 t) { num_sequences_); beta_dash_sum_vec.AddMatVec(opts_.leaky_hmm_coefficient, beta_dash_mat, kTrans, den_graph_.InitialProbs(), 0.0); + CuSubVector beta_smbr_dash_sum_vec( + this_beta_smbr_dash + den_graph_.NumStates() * num_sequences_, + num_sequences_); + beta_smbr_dash_sum_vec.AddMatVec(opts_.leaky_hmm_coefficient, + beta_smbr_dash_mat, kTrans, + den_graph_.InitialProbs(), 0.0); + // we are computing beta in place. After the following, beta-dash-mat // will contain the actual beta (i.e. the counterpart of alpha), // not the beta-dash. beta_dash_mat.AddVecToRows(1.0, beta_dash_sum_vec); - //beta_smbr_.Row(t % 2).DivElements(beta_.Row(t % 2)); + + beta_smbr_dash_mat.AddVecToRows(1.0, beta_smbr_dash_sum_vec); + beta_smbr_dash_mat.DivElements(beta_dash_mat); } BaseFloat DenominatorSmbrComputation::ForwardSmbr() { - AlphaFirstFrame(); AlphaSmbrFirstFrame(); AlphaSmbrDash(0); for (int32 t = 1; t <= frames_per_sequence_; t++) { @@ -304,7 +331,7 @@ BaseFloat DenominatorSmbrComputation::ComputeTotObjf() { bool DenominatorSmbrComputation::BackwardSmbr( BaseFloat deriv_weight, CuMatrixBase *nnet_output_deriv) { - BetaDashLastFrame(); + BetaSmbrDashLastFrame(); BetaSmbr(frames_per_sequence_); for (int32 t = frames_per_sequence_ - 1; t >= 0; t--) { BetaSmbrGeneralFrame(t); @@ -333,7 +360,7 @@ bool DenominatorSmbrComputation::BackwardSmbr( return ok_; } -void DenominatorSmbrComputation::BetaDashLastFrame() { +void DenominatorSmbrComputation::BetaSmbrDashLastFrame() { // sets up the beta-dash quantity on the last frame (frame == // frames_per_sequence_). Note that the betas we use here contain a // 1/(tot-prob) factor in order to simplify the backprop. @@ -351,15 +378,13 @@ void DenominatorSmbrComputation::BetaDashLastFrame() { // the beta values at the end of the file only vary with the sequence-index, // not with the HMM-index. We treat all states as having a final-prob of one. beta_dash_mat.CopyRowsFromVec(inv_tot_prob); -} - -void DenominatorSmbrComputation::BetaSmbrLastFrame() { - // sets up the beta-dash quantity on the last frame (frame == - // frames_per_sequence_). Note that the betas we use here contain a - // 1/(tot-prob) factor in order to simplify the backprop. 
+ + BaseFloat *last_frame_beta_smbr_dash = beta_smbr_.RowData(t % 2); - int32 t = frames_per_sequence_; - beta_smbr_.Row(t % 2).SetZero(); + CuSubVector beta_smbr_dash_vec(last_frame_beta_smbr_dash, + den_graph_.NumStates() + * num_sequences_); + beta_smbr_dash_vec.SetZero(); } void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { @@ -462,11 +487,9 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { } this_beta_dash[h * num_sequences + s] = tot_variable_factor / inv_arbitrary_scale; - //this_beta_smbr[h * num_sequences + s] = - // tot_beta_smbr / inv_arbitrary_scale; - if (tot_variable_factor > 0.0) + if (tot_variable_factor > 0.0) this_beta_smbr[h * num_sequences + s] = - tot_beta_smbr / tot_variable_factor; + tot_beta_smbr / tot_variable_factor; else this_beta_smbr[h * num_sequences + s] = 0.0; } @@ -502,7 +525,6 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { // alpha_smbr_vec is a vector of size 'num_hmm_states' * 'num_sequences_' CuVector alpha_beta_smbr_vec(this_beta_smbr); - //alpha_beta_smbr_vec.DivElements(this_beta_dash); alpha_beta_smbr_vec.AddVec(1.0, this_alpha_smbr, 1.0); CuVector alpha_beta_vec(this_alpha_dash); @@ -524,14 +546,14 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { // / alpha_beta_product; // use higher tolerance, since we are using randomized pruning for the // log-prob derivatives. - ///if (!ApproxEqual(this_log_prob_deriv_sum, 0, 0.01)) { - /// KALDI_WARN << "On time " << t << ", log-prob-deriv sum " - /// << this_log_prob_deriv_sum << " != " << 0; - /// if (fabs(this_log_prob_deriv_sum - 0) > 2.0) { - /// KALDI_WARN << "Excessive error detected, will abandon this minibatch"; - /// ok_ = false; - /// } - ///} + if (GetVerboseLevel() > 1 || !ApproxEqual(this_log_prob_deriv_sum, 0, 0.01)) { + KALDI_WARN << "On time " << t << ", log-prob-deriv sum " + << this_log_prob_deriv_sum << " != " << 0; + if (fabs(this_log_prob_deriv_sum - 0) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } } diff --git a/src/chain/chain-denominator-smbr.h b/src/chain/chain-denominator-smbr.h index 0fd52cf178d..a4c22ad4fd2 100644 --- a/src/chain/chain-denominator-smbr.h +++ b/src/chain/chain-denominator-smbr.h @@ -247,8 +247,6 @@ class DenominatorSmbrComputation { // setting it small is that we have to invoke an AddMat kernel more times. enum { kMaxDerivTimeSteps = 8 }; - // sets up the alpha for frame t = 0. - void AlphaFirstFrame(); // sets up the alpha for frame t = 0. void AlphaSmbrFirstFrame(); // the alpha computation for some 0 < t <= num_time_steps_. @@ -264,8 +262,7 @@ class DenominatorSmbrComputation { // from the ForwardSmbr() computation). BaseFloat ComputeTotObjf(); - void BetaDashLastFrame(); - void BetaSmbrLastFrame(); + void BetaSmbrDashLastFrame(); // beta computation for 0 <= beta < num_time_steps_. void BetaSmbrGeneralFrame(int32 t); // compute the beta quantity from the beta-dash quantity (relates to leaky hmm). 
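// A standalone sketch of the consistency check in BetaSmbrGeneralFrameDebug()
// above: at any frame, the expected accuracy recovered from the forward and
// backward SMBR quantities should be close to tot_smbr_.Sum(). Vectors are
// indexed [state * num_sequences + sequence]; the function name is a stand-in.
#include <vector>

inline double ExpectedAccuracyAtFrame(const std::vector<double> &alpha_dash,
                                      const std::vector<double> &beta_dash,
                                      const std::vector<double> &alpha_smbr,
                                      const std::vector<double> &beta_smbr,
                                      int num_sequences) {
  double weighted_sum = 0.0, alpha_beta_product = 0.0;
  for (size_t i = 0; i < alpha_dash.size(); i++) {
    double occ = alpha_dash[i] * beta_dash[i];  // occupation weight of this state
    alpha_beta_product += occ;
    // accuracy accumulated before this state plus accuracy still to come.
    weighted_sum += (alpha_smbr[i] + beta_smbr[i]) * occ;
  }
  // The betas already carry a 1/total-prob factor, which is why
  // alpha_beta_product is expected to be close to num_sequences.
  return weighted_sum / alpha_beta_product * num_sequences;
}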
diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index 1e2e418991c..3a9c350bbfe 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -48,7 +48,7 @@ DenominatorComputation::DenominatorComputation( tot_log_prob_(num_sequences_, kUndefined), log_correction_term_(num_sequences_, kUndefined), ok_(true) { - KALDI_ASSERT(opts_.leaky_hmm_coefficient >= 0.0 && + KALDI_ASSERT(opts_.leaky_hmm_coefficient > 0.0 && opts_.leaky_hmm_coefficient < 1.0); // make sure the alpha sums and beta sums are zeroed. alpha_.ColRange(den_graph_.NumStates() * num_sequences_, @@ -401,7 +401,7 @@ void DenominatorComputation::BetaGeneralFrameDebug(int32 t) { BaseFloat alpha_beta_product = VecVec(this_alpha_dash, this_beta_dash), this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); - if (true || GetVerboseLevel() > 1 || !ApproxEqual(alpha_beta_product, num_sequences_)) { + if (GetVerboseLevel() > 1 || !ApproxEqual(alpha_beta_product, num_sequences_)) { KALDI_WARN << "On time " << t << ", alpha-beta product " << alpha_beta_product << " != " << num_sequences_ << " alpha-dash-sum = " << this_alpha_dash.Sum() diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index da56ab3152d..7674f50d040 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -367,9 +367,9 @@ void ChainSmbrTrainingTest(const DenominatorGraph &den_graph, nnet_output.SetRandn(); ChainTrainingOptions opts; - opts.leaky_hmm_coefficient = 0.0; - //if (RandInt(0, 1) == 1) - // opts.leaky_hmm_coefficient = 0.2; + if (RandInt(0, 1) == 1) + opts.leaky_hmm_coefficient = 0.2; + opts.leaky_hmm_coefficient = 0.1; { KALDI_LOG << "LF-MMI training"; @@ -806,7 +806,7 @@ int main() { #if HAVE_CUDA == 1 for (loop = 0; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); - if (false && loop == 0) + if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); else CuDevice::Instantiate().SelectGpuId("yes"); From 5b7879d150e12550b1bc1c0842fdbce9cc890022 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 27 Jun 2017 17:43:21 -0400 Subject: [PATCH 020/174] smbr training --- egs/wsj/s5/steps/nnet3/chain/train.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index e599c981f94..9c35b320428 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -99,6 +99,9 @@ def get_args(): dest='left_deriv_truncate', default=None, help="Deprecated. 
Kept for back compatibility") + parser.add_argument("--chain.smbr-start-fraction", type=float, + dest='smbr_start_fraction', default=1.1, + help="Fraction of training at which to start LF-SMBR") # trainer options parser.add_argument("--trainer.num-epochs", type=float, dest='num_epochs', @@ -445,6 +448,11 @@ def train(args, run_opts): args.shrink_saturation_threshold) else shrinkage_value) + use_smbr=False + if (float(num_archives_processed) / num_archives_to_process + > args.smbr_start_fraction): + use_smbr=True + chain_lib.train_one_iteration( dir=args.dir, iter=iter, @@ -472,7 +480,8 @@ def train(args, run_opts): frame_subsampling_factor=args.frame_subsampling_factor, run_opts=run_opts, backstitch_training_scale=args.backstitch_training_scale, - backstitch_training_interval=args.backstitch_training_interval) + backstitch_training_interval=args.backstitch_training_interval, + use_smbr_objective=use_smbr) if args.cleanup: # do a clean up everythin but the last 2 models, under certain @@ -497,6 +506,11 @@ def train(args, run_opts): if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") + + use_smbr=False + if (float(num_archives_processed) / num_archives_to_process + > args.smbr_start_fraction): + use_smbr=True chain_lib.combine_models( dir=args.dir, num_iters=num_iters, models_to_combine=models_to_combine, @@ -506,8 +520,8 @@ def train(args, run_opts): l2_regularize=args.l2_regularize, xent_regularize=args.xent_regularize, run_opts=run_opts, - sum_to_one_penalty=args.combine_sum_to_one_penalty) - + sum_to_one_penalty=args.combine_sum_to_one_penalty, + use_smbr_objective=use_smbr) if args.cleanup: logger.info("Cleaning up the experiment directory " From a9736328d3218bb198d7e114763dd73309d843a6 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 29 Jun 2017 01:27:48 -0400 Subject: [PATCH 021/174] Adding missing chain-smbr-kernels.cu --- src/chain/chain-smbr-kernels.cu | 353 ++++++++++++++++++++++++++++++++ 1 file changed, 353 insertions(+) create mode 100644 src/chain/chain-smbr-kernels.cu diff --git a/src/chain/chain-smbr-kernels.cu b/src/chain/chain-smbr-kernels.cu new file mode 100644 index 00000000000..7aa33c4154b --- /dev/null +++ b/src/chain/chain-smbr-kernels.cu @@ -0,0 +1,353 @@ +// chain/chain-kernels.cu + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include +#include "chain/chain-kernels-ansi.h" + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200 +#error - Kaldi no longer supports CC1.x devices. Please use a newer GPU or \ + configure with --use-cuda=no (this will disable the use of GPU). 
+#endif + + +#ifdef __CUDACC__ +#if ( __CUDACC_VER_MAJOR__ >= 8 ) && ( !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 ) +// native implementation available +#else +#if __CUDA_ARCH__ >= 600 +#error using CAS implementation of double atomicAdd +#endif +__device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*) address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __longlong_as_double(old); +} +#endif +#endif + + +template +__device__ inline void atomic_add(Real* address, Real value) { + atomicAdd(address, value); +} + +template +__device__ inline void atomic_add_thresholded(Real* address, Real value) { + // This function uses a randomized algorithm to only do atomic adds for values + // with absolute value >= a threshold, + // and if it's below the threshold, randomly add the + // threshold itself with probability (value / threshold). This preserves + // expectations. + + // kThresholdingPowerOfTwo is defined in chain-datastruct.h; it defines + // the threshold for randomized posterior pruning. + const Real threshold = 1.0 / (1 << kThresholdingPowerOfTwo); + Real abs_value = abs(value); + if (abs_value >= threshold) { + atomic_add(address, value); + } else { + // The intention here is to do: + // with probability(value / threshold), do: + // atomic_add(address, threshold); + // We use the least significant bits of the value as a source of + // randomness. It would probably be more efficient to extract these + // random bits directly from the float, but I don't want to have to + // deal with endian-ness issues. + // + // below, x is a fixed-point representation of (value / threshold); it would + // be 16777216 == 2^24 if value == threshold and 0 if value == 0. We choose + // the power 24 because that's the number of binary digits in the mantissa + // in IEEE single precision floating point. + // Note: we parenthesize the expression like this so that the + // denominator can be precomputed as a constant expression. + int32_cuda x = abs_value / (threshold / (1 << 24)); + // in the line below, the expression (x >> 12) is a representation of (value / + // threshold) between 0 and 4096, with 4096 representing (value / threshold == + // 1), while (x & 4095) is treated as a pseudorandom number between 0 and 4095. + if ((x >> 12) > (x & 4095)) { + if (value >= 0) atomic_add(address, threshold); + else atomic_add(address, -threshold); + } + } +} + +// one iteration of the forward computation in the chain HMM with +// SMBR objective. +// The grid y determines which HMM-state we handle. [put this in the grid because +// HMM-states don't all take the same amount of time in the backwards direction, and it's +// better for scheduling to have them at the outer level.] +// The block x and grid x determine which sequence (0 ... num_sequences - 1) we handle; +// note that num_sequences == the number of elements in the minibatch, and we +// insist they all have the same number of time steps. +// note: 'probs' is indexed by sequence-index + (pdf-index * prob_stride). +// note: 'num_post' is indexed by sequence-index + (pdf-index * post_stride). 
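// A host-side standalone sketch of the expectation-preserving pruning used by
// atomic_add_thresholded() below: values smaller than the threshold are either
// dropped or rounded up to +/- threshold with probability |value| / threshold,
// so the expected contribution is unchanged; an explicit RNG stands in for the
// mantissa-bit trick used in the kernel, and the function name is a stand-in.
#include <cmath>
#include <random>

inline void AddThresholded(float *address, float value, float threshold,
                           std::mt19937 *rng) {
  float abs_value = std::abs(value);
  if (abs_value >= threshold) {
    *address += value;
  } else {
    std::uniform_real_distribution<float> u(0.0f, 1.0f);
    if (u(*rng) < abs_value / threshold)
      *address += (value >= 0.0f ? threshold : -threshold);
    // else: contribute nothing this time; correct in expectation.
  }
}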
+__global__ +static void _cuda_chain_smbr_hmm_forward( + const Int32Pair *backward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, int32_cuda prob_stride, + const BaseFloat *num_post, int32_cuda post_stride, + const BaseFloat *prev_alpha, const BaseFloat *prev_alpha_smbr, + BaseFloat *this_alpha, BaseFloat *this_alpha_smbr) { + // 'backward_transitions', indexed by hmm-state, consists of [start, end] + // indexes into the 'transitions' array. This gives us the info for + // transitions *into* this state. 'probs' contains the exponentiated neural + // net outputs; it has dimension num-output-indexes by num_sequences and its + // stride is 'prob_stride'. 'prev_alpha' and 'this_alpha', which are + // extracted from a larger matrix, both have dimension num-history-states by + // num-sequences. 'prev_alpha_smbr' and 'this_alpha_smbr' are analogous + // for the partial SMBR values. + + // s is the index of the sequence within the minibatch, + // from 0 .. num-egs-in-this-minibatch - 1. + // h is the hmm-state index. + int32_cuda s = threadIdx.x + blockIdx.x * blockDim.x, + h = blockIdx.y; + if (s >= num_sequences) + return; + + double this_tot_alpha = 0.0, this_tot_alpha_smbr = 0.0; + const DenominatorGraphTransition + *trans_iter = transitions + backward_transitions[h].first, + *trans_end = transitions + backward_transitions[h].second; + // Note: regarding this loop unrolling, I tried the automatic unrolling using + // #pragma unroll 2 (after modifying the loop to have an integer index), but I + // did not see any performance improvement, it was slightly slower. So the + // compiler must be doing something different than what I'm doing here. + const int loop_unroll = 2; // don't change this without changing the code + // below. + for (; trans_iter + loop_unroll <= trans_end; trans_iter += loop_unroll) { + BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + prev_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat transition_prob1 = trans_iter[1].transition_prob; + int32_cuda pdf_id1 = trans_iter[1].pdf_id, + prev_hmm_state1 = trans_iter[1].hmm_state; + BaseFloat pseudo_loglike0 = probs[pdf_id0 * prob_stride + s], + num_post0 = num_post[pdf_id0 * post_stride + s], + this_prev_alpha0 = prev_alpha[prev_hmm_state0 * num_sequences + s], + this_prev_alpha_smbr0 = + prev_alpha_smbr[prev_hmm_state0 * num_sequences + s], + pseudo_loglike1 = probs[pdf_id1 * prob_stride + s], + num_post1 = num_post[pdf_id1 * post_stride + s], + this_prev_alpha1 = prev_alpha[prev_hmm_state1 * num_sequences + s], + this_prev_alpha_smbr1 = + prev_alpha_smbr[prev_hmm_state1 * num_sequences + s]; + + this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0 + + this_prev_alpha1 * transition_prob1 * pseudo_loglike1; + this_tot_alpha_smbr += + (this_prev_alpha_smbr0 + num_post0) * this_prev_alpha0 + * transition_prob0 * pseudo_loglike0 + + (this_prev_alpha_smbr1 + num_post1) * this_prev_alpha1 + * transition_prob1 * pseudo_loglike1; + } + if (trans_iter != trans_end) { + // mop up the odd transition. 
+ BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + prev_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat pseudo_loglike0 = probs[pdf_id0 * prob_stride + s], + num_post0 = num_post[pdf_id0 * post_stride + s], + this_prev_alpha0 = prev_alpha[prev_hmm_state0 * num_sequences + s], + this_prev_alpha_smbr0 = + prev_alpha_smbr[prev_hmm_state0 * num_sequences + s]; + this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0; + this_tot_alpha_smbr += + (this_prev_alpha_smbr0 + num_post0) * this_prev_alpha0 + * transition_prob0 * pseudo_loglike0; + } + + // Let arbitrary_scale be the inverse of the sum of all alpha values on-- the + // previous frame this sum of all the alpha values is stored in the place that + // we'd store the previous alpha for state-index equal to num_hmm_states + // (i.e. one past the end). We multiply this into all the + // transition-probabilities from the previous frame to this frame, in both the + // forward and backward passes, in order to keep the alphas in a good numeric + // range. This won't affect the posteriors, as it's just a constant factor + // for each frame, but when computing the total likelihood we'll need to + // compensate for it later on. + BaseFloat arbitrary_scale = + 1.0 / prev_alpha[num_hmm_states * num_sequences + s]; + this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; + if (this_tot_alpha > 0.0) + this_alpha_smbr[h * num_sequences + s] = + this_tot_alpha_smbr / this_tot_alpha; + else + this_alpha_smbr[h * num_sequences + s] = 0.0; +} + + +__global__ +static void _cuda_chain_smbr_hmm_backward( + const Int32Pair *forward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, int32_cuda num_hmm_states, + const BaseFloat *probs, int32_cuda prob_stride, + const BaseFloat *num_post, int32_cuda post_stride, + const BaseFloat *tot_smbr, + const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, + const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, + BaseFloat *this_beta, BaseFloat *this_beta_smbr, + BaseFloat *log_prob_deriv, int32_cuda log_prob_deriv_stride) { + // 'forward_transitions', indexed by hmm-state, consists of [start, end] + // indexes into the 'transition_info' array. This is about the transitions + // *out of* this state. 'probs' contains the exponentiated neural net + // outputs; it has dimension num-output-indexes by num_sequences, and contains + // just the observation probabilities for this time index. Its stride is + // prob_stride. + // 'this_alpha', 'next_beta' and 'this_beta' all have dimension + // num-history-states by num-sequences. + // 'this_alpha_smbr', 'next_beta_smbr', and 'this_beta_smbr' are + // analogous quantities storing values for SMBR objective. + // The beta probs are normalized in such a way (by multiplying by 1/(total-data-prob)) + // that to get occupation counts we don't need to multiply by 1/total-data-prob. + // deriv_scale is a factor (e.g. -1.0 or -0.99) that we multiply these derivs by + // while accumulating them. + + // s is the index of the sequence within the minibatch, + // from 0 .. num-egs-in-this-minibatch - 1. + // h is the hmm-state index. + int32_cuda s = threadIdx.x + blockIdx.x * blockDim.x, + h = blockIdx.y; + if (s >= num_sequences) + return; + + // See where arbitrary_scale is defined in the forward computation above, for + // more explanation of inv_arbitrary_scale. 
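+  // [Editorial note, not part of the original patch.]  For orientation: in the
+  // loops below, each outgoing arc (h -> j) that emits pdf p contributes an
+  // occupation probability
+  //     gamma = (this_alpha(h) / inv_arbitrary_scale) * trans_prob * prob(p) * next_beta(j),
+  // and the quantity accumulated into 'log_prob_deriv' for pdf p is
+  //     gamma * (this_alpha_smbr(h) + num_post(p) + next_beta_smbr(j) - tot_smbr[s]),
+  // i.e. (under the usual sMBR reading) the arc occupancy times how much the
+  // expected accuracy of paths through this arc exceeds the sequence's overall
+  // expected accuracy tot_smbr[s].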
+ BaseFloat this_alpha_prob = this_alpha[h * num_sequences + s], + this_alpha_smbr_i = this_alpha_smbr[h * num_sequences + s], + inv_arbitrary_scale = + this_alpha[num_hmm_states * num_sequences + s]; + double tot_variable_factor = 0.0, tot_beta_smbr = 0.0; + + BaseFloat occupation_factor = this_alpha_prob / inv_arbitrary_scale; + const DenominatorGraphTransition + *trans_iter = transitions + forward_transitions[h].first, + *trans_end = transitions + forward_transitions[h].second; + const int loop_unroll = 2; // don't change this without changing the code + // below. + for (; trans_iter + loop_unroll <= trans_end; trans_iter += loop_unroll) { + BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + next_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat transition_prob1 = trans_iter[1].transition_prob; + int32_cuda pdf_id1 = trans_iter[1].pdf_id, + next_hmm_state1 = trans_iter[1].hmm_state; + BaseFloat next_beta_j0 = next_beta[next_hmm_state0 * num_sequences + s], + next_beta_smbr_j0 = next_beta_smbr[next_hmm_state0 * num_sequences + s], + next_beta_j1 = next_beta[next_hmm_state1 * num_sequences + s], + next_beta_smbr_j1 = next_beta_smbr[next_hmm_state1 * num_sequences + s], + prob0 = probs[pdf_id0 * prob_stride + s], + prob1 = probs[pdf_id1 * prob_stride + s], + num_post0 = num_post[pdf_id0 * post_stride + s], + num_post1 = num_post[pdf_id1 * post_stride + s]; + + BaseFloat variable_factor0 = transition_prob0 * next_beta_j0 * prob0, + variable_factor1 = transition_prob1 * next_beta_j1 * prob1; + tot_beta_smbr += (next_beta_smbr_j0 + num_post0) * variable_factor0 + + (next_beta_smbr_j1 + num_post1) * variable_factor1; + tot_variable_factor += variable_factor0 + variable_factor1; + BaseFloat this_gamma_r0 = occupation_factor * variable_factor0 + * (this_alpha_smbr_i + num_post0 + next_beta_smbr_j0 - tot_smbr[s]); + atomic_add_thresholded(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), + this_gamma_r0); + BaseFloat this_gamma_r1 = occupation_factor * variable_factor1 + * (this_alpha_smbr_i + num_post1 + next_beta_smbr_j1 - tot_smbr[s]); + atomic_add_thresholded(log_prob_deriv + (pdf_id1 * log_prob_deriv_stride + s), + this_gamma_r1); + } + if (trans_iter != trans_end) { + // mop up the odd transition. 
+ BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + next_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat next_beta_j0 = next_beta[next_hmm_state0 * num_sequences + s], + next_beta_smbr_j0 = next_beta_smbr[next_hmm_state0 * num_sequences + s], + prob0 = probs[pdf_id0 * prob_stride + s], + num_post0 = num_post[pdf_id0 * post_stride + s]; + BaseFloat variable_factor0 = transition_prob0 * next_beta_j0 * prob0; + tot_beta_smbr += (next_beta_smbr_j0 + num_post0) * variable_factor0; + tot_variable_factor += variable_factor0; + BaseFloat this_gamma_r0 = occupation_factor * variable_factor0 + * (this_alpha_smbr_i + num_post0 + next_beta_smbr_j0 - tot_smbr[s]); + atomic_add_thresholded(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), + this_gamma_r0); + } + BaseFloat beta = tot_variable_factor / inv_arbitrary_scale; + this_beta[h * num_sequences + s] = beta; + if (tot_variable_factor > 0.0) + this_beta_smbr[h * num_sequences + s] = + tot_beta_smbr / tot_variable_factor; + else + this_beta_smbr[h * num_sequences + s] = 0.0; +} + + +// Chain forward with SMBR objective +void cuda_chain_smbr_hmm_forward( + dim3 Gr, dim3 Bl, + const Int32Pair *backward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, int32_cuda prob_stride, + const BaseFloat *num_post, int32_cuda post_stride, + const BaseFloat *prev_alpha, const BaseFloat *prev_alpha_smbr, + BaseFloat *this_alpha, BaseFloat *this_alpha_smbr) { + _cuda_chain_smbr_hmm_forward<<>>( + backward_transitions, transitions, + num_sequences, num_hmm_states, + probs, prob_stride, num_post, post_stride, + prev_alpha, prev_alpha_smbr, this_alpha, this_alpha_smbr); +} + +void cuda_chain_smbr_hmm_backward( + dim3 Gr, dim3 Bl, + const Int32Pair *forward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, int32_cuda prob_stride, + const BaseFloat *num_post, int32_cuda post_stride, + const BaseFloat *tot_smbr, + const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, + const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, + BaseFloat *this_beta, BaseFloat *this_beta_smbr, + BaseFloat *log_prob_deriv, + int32_cuda log_prob_deriv_stride) { + _cuda_chain_smbr_hmm_backward<<>>( + forward_transitions, transitions, + num_sequences, num_hmm_states, + probs, prob_stride, num_post, post_stride, tot_smbr, + this_alpha, this_alpha_smbr, next_beta, next_beta_smbr, + this_beta, this_beta_smbr, log_prob_deriv, + log_prob_deriv_stride); +} From 55d3321c958c64055b680e7e6ea6f3b773c9b892 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Thu, 29 Jun 2017 15:01:39 -0400 Subject: [PATCH 022/174] Add phone-insertion-penalty + minor updates --- .../s5/local/chain/run_semisupervised.sh | 13 +++- egs/fisher_english/s5/local/chain/run_tdnn.sh | 64 +++++++++++++------ egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 6 +- 3 files changed, 61 insertions(+), 22 deletions(-) diff --git a/egs/fisher_english/s5/local/chain/run_semisupervised.sh b/egs/fisher_english/s5/local/chain/run_semisupervised.sh index 50c9b04cf48..77ae92e49b6 100755 --- a/egs/fisher_english/s5/local/chain/run_semisupervised.sh +++ b/egs/fisher_english/s5/local/chain/run_semisupervised.sh @@ -18,14 +18,15 @@ train_supervised_opts="--stage -10 --train-stage -10" decode_affix= egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs 
dir comb_affix=_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets -unsup_frames_per_eg= # if empty will be equal to the supervised model's config +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly unsup_egs_weight=1.0 lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data left_tolerance=2 right_tolerance=2 train_combined_opts="--num-epochs 4.5" - +graph_affix= # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= # to tune: # frames_per_eg for unsupervised @@ -37,6 +38,7 @@ echo "$0 $@" # Print the command line for logging . ./utils/parse_options.sh nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} if ! cuda-compiled; then cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
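+  # [Editorial example, not part of the original recipe.]  For instance, with a
+  # typical setting of xent_regularize=0.1, learning_rate_factor comes out to
+  # 0.5 / 0.1 = 5.0: the 1/0.1 part undoes the xent_regularize scaling applied
+  # to the cross-entropy gradients, and the extra 0.5 is the tuned constant
+  # mentioned above.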
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - # create the config files for nnet initialization - steps/nnet3/tdnn/make_configs.py \ - --self-repair-scale-nonlinearity 0.00001 \ - --feat-dir $train_data_dir \ - --ivector-dir $train_ivector_dir \ - --tree-dir $treedir \ - --relu-dim 725 \ - --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ - --use-presoftmax-prior-scale false \ - --xent-regularize 0.1 \ - --xent-separate-forward-affine true \ - --include-log-softmax false \ - --final-layer-normalize-target 0.5 \ - $dir/configs || exit 1; +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi if [ $stage -le 13 ]; then @@ -125,7 +153,7 @@ if [ $stage -le 13 ]; then --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs $num_epochs \ --trainer.optimization.num-jobs-initial 3 \ @@ -140,15 +168,15 @@ if [ $stage -le 13 ]; then --dir $dir || exit 1; fi +graph_dir=$dir/graph if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir fi decode_suff= -graph_dir=$dir/graph if [ $stage -le 15 ]; then iter_opts= if [ ! -z $decode_iter ]; then diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index a448309ead4..076dc95b2d7 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -70,7 +70,8 @@ egs_weight=1.0 # The weight which determines how much each training example # contributes to gradients while training (can be used # to down/up-weight a dataset) lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, - # before being used to get supervisions. + # before being used to get supervisions. +phone_insertion_penalty= echo "$0 $@" # Print the command line for logging @@ -319,6 +320,9 @@ chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$ali [ ! -z $lattice_lm_scale ] && \ chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" +[ ! 
-z $phone_insertion_penalty ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --phone-ins-penalty=$phone_insertion_penalty" + echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context echo $left_context_initial > $dir/info/left_context_initial From f776b3aabbcdd62f12ff148edd90108d032d79fb Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 30 Jun 2017 14:54:49 -0400 Subject: [PATCH 023/174] Minor bug fixes --- .../nnet3/train/chain_objf/acoustic_model.py | 20 +++++++++++-------- src/chainbin/nnet3-chain-combine.cc | 2 +- src/nnet3/nnet-chain-diagnostics.cc | 10 ++++------ src/nnet3/nnet-chain-diagnostics.h | 3 +-- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 302e99d0d73..1ccb456c72b 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -179,7 +179,7 @@ def train_new_models(dir, iter, srand, num_jobs, thread = common_lib.background_command( """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ nnet3-chain-train {parallel_train_opts} {verbose_opt} \ - --apply-deriv-weights={app_deriv_wts} \ + --apply-deriv-weights={app_deriv_wts} {smbr_opt} \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ {cache_io_opts} --xent-regularize={xent_reg} \ {deriv_time_opts} \ @@ -302,10 +302,11 @@ def train_one_iteration(dir, iter, srand, egs_dir, if shrinkage_value != 1.0: shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value) + objf_info = "" if not use_smbr_objective else "and objective is sMBR" logger.info("On iteration {0}, learning rate is {1}" - "{shrink_info}.".format( + "{shrink_info} {objf_info}.".format( iter, learning_rate, - shrink_info=shrink_info_str)) + shrink_info=shrink_info_str, objf_info=objf_info)) train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs, num_archives_processed=num_archives_processed, @@ -462,7 +463,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, common_lib.background_command( """{command} {dir}/log/compute_prob_valid.{iter}.log \ - nnet3-chain-compute-prob --l2-regularize={l2} \ + nnet3-chain-compute-prob --l2-regularize={l2} {smbr_opt} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/valid_diagnostic.cegs \ @@ -477,7 +478,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, common_lib.background_command( """{command} {dir}/log/compute_prob_train.{iter}.log \ - nnet3-chain-compute-prob --l2-regularize={l2} \ + nnet3-chain-compute-prob --l2-regularize={l2} {smbr_opt} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/train_diagnostic.cegs \ @@ -507,10 +508,11 @@ def compute_progress(dir, iter, run_opts): model=model, prev_model=prev_model)) + def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, egs_dir, leaky_hmm_coefficient, l2_regularize, xent_regularize, run_opts, - sum_to_one_penalty=0.0): + sum_to_one_penalty=0.0, use_smbr_objective=False): """ Function to do model combination In the nnet3 setup, the logic @@ -546,7 +548,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st common_lib.execute_command( 
"""{command} {combine_queue_opt} {dir}/log/combine.log \ - nnet3-chain-combine --num-iters={opt_iters} \ + nnet3-chain-combine --num-iters={opt_iters} {smbr_opt} \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --separate-weights-per-component={separate_weights} \ --enforce-sum-to-one={hard_enforce} \ @@ -568,7 +570,9 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st penalty=sum_to_one_penalty, num_chunk_per_mb=num_chunk_per_minibatch_str, num_iters=num_iters, - egs_dir=egs_dir)) + egs_dir=egs_dir, + smbr_opts="--use-smbr-objective" if use_smbr_objective + else "")) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index a5ef5ac04e8..3c44e6b904c 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -117,7 +117,7 @@ int main(int argc, char *argv[]) { nnet = combiner.GetNnet(); if (HasBatchnorm(nnet)) - RecomputeStats(egs, chain_config, den_fst, den_fst_to_output, &nnet); + RecomputeStats(egs, chain_config, den_fst, &nnet); #if HAVE_CUDA==1 CuDevice::Instantiate().PrintProfile(); diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 95d38d57679..1b1c630d4dc 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -54,14 +54,13 @@ NnetChainComputeProb::NnetChainComputeProb( Nnet *nnet): nnet_config_(nnet_config), chain_config_(chain_config), + den_graph_(den_fst, nnet->OutputDim("output")), nnet_(*nnet), compiler_(*nnet, nnet_config_.optimize_config, nnet_config_.compiler_config), deriv_nnet_owned_(false), deriv_nnet_(nnet), num_minibatches_processed_(0) { - chain::DenominatorGraph den_graph(den_fst, nnet->OutputDim("output")); - KALDI_ASSERT(den_graph.NumPdfs() > 0); - den_graph_.insert(std::make_pair("output", den_graph)); + KALDI_ASSERT(den_graph_.NumPdfs() > 0); KALDI_ASSERT(nnet_config.store_component_stats && !nnet_config.compute_deriv); } @@ -229,8 +228,7 @@ const ChainObjectiveInfo* NnetChainComputeProb::GetObjective( void RecomputeStats(const std::vector &egs, const chain::ChainTrainingOptions &chain_config_in, - const std::vector &den_fst, - const std::vector &den_to_output, + const fst::StdVectorFst &den_fst, Nnet *nnet) { KALDI_LOG << "Recomputing stats on nnet (affects batch-norm)"; chain::ChainTrainingOptions chain_config(chain_config_in); @@ -246,7 +244,7 @@ void RecomputeStats(const std::vector &egs, NnetComputeProbOptions nnet_config; nnet_config.store_component_stats = true; NnetChainComputeProb prob_computer(nnet_config, chain_config, den_fst, - den_to_output, *nnet); + *nnet); for (size_t i = 0; i < egs.size(); i++) prob_computer.Compute(egs[i]); prob_computer.PrintTotalStats(); diff --git a/src/nnet3/nnet-chain-diagnostics.h b/src/nnet3/nnet-chain-diagnostics.h index 047fed55dbb..4125427c463 100644 --- a/src/nnet3/nnet-chain-diagnostics.h +++ b/src/nnet3/nnet-chain-diagnostics.h @@ -111,8 +111,7 @@ class NnetChainComputeProb { /// declared in nnet-utils.h. 
void RecomputeStats(const std::vector &egs, const chain::ChainTrainingOptions &chain_config, - const std::vector &den_fst, - const std::vector &den_to_output, + const fst::StdVectorFst &den_fst, Nnet *nnet); From 845f27b7e8e63c6fd98e6541877803dc398cd84b Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 12 Jul 2017 15:20:44 -0400 Subject: [PATCH 024/174] chain-smbr: Adding smbr --- .../nnet3/train/chain_objf/acoustic_model.py | 11 ++-- egs/wsj/s5/steps/libs/nnet3/train/common.py | 5 +- egs/wsj/s5/steps/nnet3/chain/train.py | 50 ++++++++++++++++--- .../s5/steps/nnet3/report/generate_plots.py | 1 + src/chain/chain-denominator-smbr.cc | 1 - src/chain/chain-smbr-kernels.cu | 6 +-- src/chain/chain-supervision-test.cc | 8 +-- src/chain/chain-training.cc | 3 +- src/chain/language-model-test.cc | 2 +- src/chainbin/nnet3-chain-copy-egs.cc | 8 ++- src/nnet3/nnet-chain-example.cc | 22 ++++++++ src/nnet3/nnet-chain-example.h | 9 ++++ src/nnet3/nnet-chain-training.cc | 15 ++++++ src/nnet3/nnet-chain-training.h | 6 ++- src/nnet3/nnet-training.cc | 7 +++ src/nnet3/nnet-training.h | 2 + 16 files changed, 130 insertions(+), 26 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 1ccb456c72b..fc49f9fedff 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -130,7 +130,7 @@ def train_new_models(dir, iter, srand, num_jobs, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch_str, - frame_subsampling_factor, run_opts, + frame_subsampling_factor, truncate_deriv_weights, run_opts, backstitch_training_scale=0.0, backstitch_training_interval=1, use_smbr_objective=False): """ @@ -190,6 +190,7 @@ def train_new_models(dir, iter, srand, num_jobs, --srand={srand} \ "{raw_model}" {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs \ + --truncate-deriv-weights={trunc_deriv} \ --frame-shift={fr_shft} \ ark:{egs_dir}/cegs.{archive_index}.ark ark:- | \ nnet3-chain-shuffle-egs --buffer-size={buf_size} \ @@ -201,6 +202,7 @@ def train_new_models(dir, iter, srand, num_jobs, dir=dir, iter=iter, srand=iter + srand, next_iter=iter + 1, job=job, deriv_time_opts=" ".join(deriv_time_opts), + trunc_deriv=truncate_deriv_weights, app_deriv_wts=apply_deriv_weights, fr_shft=frame_shift, l2=l2_regularize, xent_reg=xent_regularize, leaky=leaky_hmm_coefficient, @@ -235,7 +237,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, - frame_subsampling_factor, + frame_subsampling_factor, truncate_deriv_weights, run_opts, dropout_edit_string="", backstitch_training_scale=0.0, backstitch_training_interval=1, use_smbr_objective=False): @@ -324,6 +326,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, shuffle_buffer_size=shuffle_buffer_size, num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str, frame_subsampling_factor=frame_subsampling_factor, + truncate_deriv_weights=truncate_deriv_weights, run_opts=run_opts, # linearly increase backstitch_training_scale during the # first few iterations (hard-coded as 15) @@ -571,8 +574,8 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st num_chunk_per_mb=num_chunk_per_minibatch_str, num_iters=num_iters, egs_dir=egs_dir, - smbr_opts="--use-smbr-objective" if use_smbr_objective - else "")) + 
smbr_opt="--use-smbr-objective" if use_smbr_objective + else "")) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 08b3b4c0b35..45b863c8087 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -502,7 +502,8 @@ def smooth_presoftmax_prior_scale_vector(pdf_counts, def prepare_initial_network(dir, run_opts, srand=-3): - if os.path.exists(dir+"/configs/init.config"): + if os.path.exists(dir+"/configs/init.config") or os.path.exists( + "{0}/init.raw".format(dir)): common_lib.execute_command( """{command} {dir}/log/add_first_layer.log \ nnet3-init --srand={srand} {dir}/init.raw \ @@ -856,7 +857,7 @@ def __init__(self, sequentially.""") self.parser.add_argument("--trainer.optimization.backstitch-training-scale", type=float, dest='backstitch_training_scale', - default=0.0, help="""scale of parameters changes + default=0.0, help="""scale of parameters changes used in backstitch training step.""") self.parser.add_argument("--trainer.optimization.backstitch-training-interval", type=int, dest='backstitch_training_interval', diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 9c35b320428..2bb86c18459 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -86,6 +86,11 @@ def get_args(): action=common_lib.StrToBoolAction, choices=["true", "false"], help="") + parser.add_argument("--chain.truncate-deriv-weights", type=int, + dest='truncate_deriv_weights', default=0, + help="""Can be used to set to zero the weights of + derivs from frames near the edges. 
(counts subsampled + frames)""") parser.add_argument("--chain.frame-subsampling-factor", type=int, dest='frame_subsampling_factor', default=3, help="ratio of frames-per-second of features we " @@ -102,6 +107,15 @@ def get_args(): parser.add_argument("--chain.smbr-start-fraction", type=float, dest='smbr_start_fraction', default=1.1, help="Fraction of training at which to start LF-SMBR") + parser.add_argument("--chain.smbr-learning-rate-factor", default=1.0, + dest='smbr_learning_rate_factor', type=float, + help="Learning rate factor used for sMBR training") + parser.add_argument("--chain.smbr-xent-regularize", default=None, + dest='smbr_xent_regularize', type=float, + help="Xent regularizer term used with sMBR training") + parser.add_argument("--chain.smbr-l2-regularize", default=None, + dest='smbr_l2_regularize', type=float, + help="L2 regularizer term used with sMBR training") # trainer options parser.add_argument("--trainer.num-epochs", type=float, dest='num_epochs', @@ -312,7 +326,9 @@ def train(args, run_opts): logger.info("Creating denominator FST") chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts) - if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config"): + if (args.stage <= -4 + and not os.path.exists(args.dir+"/init.raw") + and os.path.exists(args.dir+"/configs/init.config")): logger.info("Initializing a basic network for estimating " "preconditioning matrix") common_lib.execute_command( @@ -448,10 +464,19 @@ def train(args, run_opts): args.shrink_saturation_threshold) else shrinkage_value) + xent_regularize = args.xent_regularize + l2_regularize = args.l2_regularize use_smbr=False if (float(num_archives_processed) / num_archives_to_process - > args.smbr_start_fraction): + >= args.smbr_start_fraction): use_smbr=True + lrate *= args.smbr_learning_rate_factor + xent_regularize = (args.smbr_xent_regularize + if args.smbr_xent_regularize is not None + else args.xent_regularize) + l2_regularize = (args.smbr_l2_regularize + if args.smbr_l2_regularize is not None + else args.l2_regularize) chain_lib.train_one_iteration( dir=args.dir, @@ -471,13 +496,14 @@ def train(args, run_opts): apply_deriv_weights=args.apply_deriv_weights, min_deriv_time=min_deriv_time, max_deriv_time_relative=max_deriv_time_relative, - l2_regularize=args.l2_regularize, - xent_regularize=args.xent_regularize, + l2_regularize=l2_regularize, + xent_regularize=xent_regularize, leaky_hmm_coefficient=args.leaky_hmm_coefficient, momentum=args.momentum, max_param_change=args.max_param_change, shuffle_buffer_size=args.shuffle_buffer_size, frame_subsampling_factor=args.frame_subsampling_factor, + truncate_deriv_weights=args.truncate_deriv_weights, run_opts=run_opts, backstitch_training_scale=args.backstitch_training_scale, backstitch_training_interval=args.backstitch_training_interval, @@ -507,18 +533,26 @@ def train(args, run_opts): if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - use_smbr=False + xent_regularize = args.xent_regularize + l2_regularize = args.l2_regularize + use_smbr = False if (float(num_archives_processed) / num_archives_to_process - > args.smbr_start_fraction): + >= args.smbr_start_fraction): use_smbr=True + xent_regularize = (args.smbr_xent_regularize + if args.smbr_xent_regularize is not None + else args.xent_regularize) + l2_regularize = (args.smbr_l2_regularize + if args.smbr_l2_regularize is not None + else args.l2_regularize) chain_lib.combine_models( dir=args.dir, num_iters=num_iters, models_to_combine=models_to_combine, 
num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, egs_dir=egs_dir, leaky_hmm_coefficient=args.leaky_hmm_coefficient, - l2_regularize=args.l2_regularize, - xent_regularize=args.xent_regularize, + l2_regularize=l2_regularize, + xent_regularize=xent_regularize, run_opts=run_opts, sum_to_one_penalty=args.combine_sum_to_one_penalty, use_smbr_objective=use_smbr) diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index 8ec283492ef..6f7987c425f 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -732,6 +732,7 @@ def main(): output_nodes.append(tuple(parts)) elif args.is_chain: output_nodes.append(('output', 'chain')) + output_nodes.append(('output-xent', 'chain')) else: output_nodes.append(('output', 'linear')) diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc index 607be64decf..41a1110502a 100644 --- a/src/chain/chain-denominator-smbr.cc +++ b/src/chain/chain-denominator-smbr.cc @@ -230,7 +230,6 @@ void DenominatorSmbrComputation::AlphaSmbrDash(int32 t) { alpha_smbr_sum_vec.AddRowSumMat(1.0, alpha_smbr_mat, 0.0); BaseFloat alpha_sum = alpha_sum_vec.Sum(); - KALDI_VLOG(2) << "alpha-sum for time " << t << " is " << alpha_sum; KALDI_ASSERT(alpha_sum_vec.Min() > 0); alpha_smbr_mat.AddVecVec(opts_.leaky_hmm_coefficient, diff --git a/src/chain/chain-smbr-kernels.cu b/src/chain/chain-smbr-kernels.cu index 7aa33c4154b..cfcf19dffad 100644 --- a/src/chain/chain-smbr-kernels.cu +++ b/src/chain/chain-smbr-kernels.cu @@ -277,11 +277,11 @@ static void _cuda_chain_smbr_hmm_backward( tot_variable_factor += variable_factor0 + variable_factor1; BaseFloat this_gamma_r0 = occupation_factor * variable_factor0 * (this_alpha_smbr_i + num_post0 + next_beta_smbr_j0 - tot_smbr[s]); - atomic_add_thresholded(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), + atomic_add(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), this_gamma_r0); BaseFloat this_gamma_r1 = occupation_factor * variable_factor1 * (this_alpha_smbr_i + num_post1 + next_beta_smbr_j1 - tot_smbr[s]); - atomic_add_thresholded(log_prob_deriv + (pdf_id1 * log_prob_deriv_stride + s), + atomic_add(log_prob_deriv + (pdf_id1 * log_prob_deriv_stride + s), this_gamma_r1); } if (trans_iter != trans_end) { @@ -298,7 +298,7 @@ static void _cuda_chain_smbr_hmm_backward( tot_variable_factor += variable_factor0; BaseFloat this_gamma_r0 = occupation_factor * variable_factor0 * (this_alpha_smbr_i + num_post0 + next_beta_smbr_j0 - tot_smbr[s]); - atomic_add_thresholded(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), + atomic_add(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), this_gamma_r0); } BaseFloat beta = tot_variable_factor / inv_arbitrary_scale; diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index 7674f50d040..eb5e263427a 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -710,7 +710,7 @@ void ChainSupervisionSimpleTest() { supervision.Check(*trans_model); TestSupervisionIo(supervision); TestSupervisionSplitting(*ctx_dep, *trans_model, supervision); - //TestSupervisionAppend(*trans_model, supervision); + TestSupervisionAppend(*trans_model, supervision); { fst::StdVectorFst den_fst; @@ -812,11 +812,11 @@ int main() { CuDevice::Instantiate().SelectGpuId("yes"); #endif for (int32 i = 0; i < 3; i++) { - //kaldi::chain::ChainSupervisionTest(); + kaldi::chain::ChainSupervisionTest(); 
kaldi::chain::ChainSupervisionSimpleTest(); - //kaldi::chain::BreadthFirstTest(); + kaldi::chain::BreadthFirstTest(); } - //kaldi::chain::TestRanges(); + kaldi::chain::TestRanges(); #if HAVE_CUDA == 1 } CuDevice::Instantiate().PrintProfile(); diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 3aebe3bfcfb..87e5829bfd3 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -133,8 +133,9 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, // the numerator object, and the logprob too. numerator.Forward(); numerator.Backward(&num_posteriors); - if (xent_output_deriv) + if (xent_output_deriv) { xent_output_deriv->CopyFromMat(num_posteriors); + } } DenominatorSmbrComputation denominator(opts, den_graph, supervision.num_sequences, diff --git a/src/chain/language-model-test.cc b/src/chain/language-model-test.cc index 286b3afc115..04a57441ada 100644 --- a/src/chain/language-model-test.cc +++ b/src/chain/language-model-test.cc @@ -86,7 +86,7 @@ void LanguageModelTest() { LanguageModelEstimator estimator(opts); for (size_t i = 0; i < data.size(); i++) { std::vector &sentence = data[i]; - estimator.AddCounts(sentence, 1); + estimator.AddCounts(sentence); } fst::StdVectorFst fst; diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 4f26e145ac5..3e20becc59e 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -265,6 +265,7 @@ int main(int argc, char *argv[]) { bool random = false; int32 srand_seed = 0; int32 frame_shift = 0; + int32 truncate_deriv_weights = 0; int32 frame_subsampling_factor = -1; BaseFloat keep_proportion = 1.0; int32 left_context = -1, right_context = -1; @@ -281,6 +282,9 @@ int main(int argc, char *argv[]) { "in the supervision data (excluding iVector data) - useful in " "augmenting data. Note, the outputs will remain at the closest " "exact multiples of the frame subsampling factor"); + po.Register("truncate-deriv-weights", &truncate_deriv_weights, + "If nonzero, the number of initial/final subsample frames that " + "will have their derivatives' weights set to zero."); po.Register("left-context", &left_context, "Can be used to truncate the " "feature left-context that we output."); po.Register("right-context", &right_context, "Can be used to truncate the " @@ -316,7 +320,7 @@ int main(int argc, char *argv[]) { // count is normally 1; could be 0, or possibly >1. int32 count = GetCount(keep_proportion); std::string key = example_reader.Key(); - if (frame_shift == 0 && + if (frame_shift == 0 && truncate_deriv_weights == 0 && left_context == -1 && right_context == -1) { const NnetChainExample &eg = example_reader.Value(); for (int32 c = 0; c < count; c++) { @@ -334,6 +338,8 @@ int main(int argc, char *argv[]) { frame_subsampling_factor, &eg_out); else eg_out.Swap(&eg); + if (truncate_deriv_weights != 0) + TruncateDerivWeights(truncate_deriv_weights, &eg_out); for (int32 c = 0; c < count; c++) { int32 index = (random ? 
Rand() : num_written) % num_outputs; example_writers[index]->Write(key, eg_out); diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index 351312fb952..ea703a99d52 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -290,6 +290,28 @@ void MergeChainExamples(bool compress, } } +void TruncateDerivWeights(int32 truncate, + NnetChainExample *eg) { + for (size_t i = 0; i < eg->outputs.size(); i++) { + NnetChainSupervision &supervision = eg->outputs[i]; + Vector &deriv_weights = supervision.deriv_weights; + if (deriv_weights.Dim() == 0) { + deriv_weights.Resize(supervision.indexes.size()); + deriv_weights.Set(1.0); + } + int32 num_sequences = supervision.supervision.num_sequences, + frames_per_sequence = supervision.supervision.frames_per_sequence; + KALDI_ASSERT(2 * truncate < frames_per_sequence); + for (int32 t = 0; t < truncate; t++) + for (int32 s = 0; s < num_sequences; s++) + deriv_weights(t * num_sequences + s) = 0.0; + for (int32 t = frames_per_sequence - truncate; + t < frames_per_sequence; t++) + for (int32 s = 0; s < num_sequences; s++) + deriv_weights(t * num_sequences + s) = 0.0; + } +} + void GetChainComputationRequest(const Nnet &nnet, const NnetChainExample &eg, bool need_model_derivative, diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h index 2718af746b2..a8ef5dada2d 100644 --- a/src/nnet3/nnet-chain-example.h +++ b/src/nnet3/nnet-chain-example.h @@ -189,6 +189,15 @@ void ShiftChainExampleTimes(int32 frame_shift, const std::vector &exclude_names, NnetChainExample *eg); +/** + This sets to zero any elements of 'egs->outputs[*].deriv_weights' that correspond + to frames within the first or last 'truncate' frames of the sequence (e.g. you could + set 'truncate=5' to set zero deriv-weight for the first and last 5 frames of the + sequence). + */ +void TruncateDerivWeights(int32 truncate, + NnetChainExample *eg); + /** This function takes a NnetChainExample and produces a ComputationRequest. Assumes you don't want the derivatives w.r.t. the inputs; if you do, you can create the ComputationRequest manually. 
Assumes that if diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 7ce2218b080..c8f58bedebb 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -214,6 +214,14 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, xent_deriv.MulRowsVec(cu_deriv_weights); } + if (opts_.accumulate_avg_deriv && + objf_info_[sup.name + suffix].deriv_sum.Dim() == 0) + objf_info_[sup.name + suffix].deriv_sum.Resize(nnet_output.NumCols()); + + if (objf_info_[sup.name + suffix].deriv_sum.Dim() > 0) + objf_info_[sup.name + suffix].deriv_sum.AddRowSumMat( + 1.0, nnet_output_deriv, 1.0); + computer->AcceptInput(sup.name, &nnet_output_deriv); objf_info_[sup.name + suffix].UpdateStats(sup.name + suffix, @@ -223,6 +231,12 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, if (use_xent) { xent_deriv.Scale(opts_.chain_config.xent_regularize); + if (opts_.accumulate_avg_deriv && + objf_info_[xent_name + suffix].deriv_sum.Dim() == 0) + objf_info_[xent_name + suffix].deriv_sum.Resize(nnet_output.NumCols()); + if (objf_info_[xent_name + suffix].deriv_sum.Dim() > 0) + objf_info_[xent_name + suffix].deriv_sum.AddRowSumMat( + 1.0, xent_deriv, 1.0); computer->AcceptInput(xent_name, &xent_deriv); } } @@ -236,6 +250,7 @@ bool NnetChainTrainer::PrintTotalStats() const { for (; iter != end; ++iter) { const std::string &name = iter->first; const ObjectiveFunctionInfo &info = iter->second; + ans = info.PrintTotalStats(name) || ans; } PrintMaxChangeStats(); diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h index 5bf6a3f6fce..b76608e2794 100644 --- a/src/nnet3/nnet-chain-training.h +++ b/src/nnet3/nnet-chain-training.h @@ -36,7 +36,8 @@ struct NnetChainTrainingOptions { NnetTrainerOptions nnet_config; chain::ChainTrainingOptions chain_config; bool apply_deriv_weights; - NnetChainTrainingOptions(): apply_deriv_weights(true) { } + bool accumulate_avg_deriv; + NnetChainTrainingOptions(): apply_deriv_weights(true), accumulate_avg_deriv(true) { } void Register(OptionsItf *opts) { nnet_config.Register(opts); @@ -44,6 +45,9 @@ struct NnetChainTrainingOptions { opts->Register("apply-deriv-weights", &apply_deriv_weights, "If true, apply the per-frame derivative weights stored with " "the example"); + opts->Register("accumulate-avg-deriv", &accumulate_avg_deriv, + "If true, the average derivative will be accumulated and " + "printed"); } }; diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 6adac4c8182..7428818014c 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -296,6 +296,13 @@ bool ObjectiveFunctionInfo::PrintTotalStats(const std::string &name) const { << objf << " + " << aux_objf << " = " << sum_objf << " over " << tot_weight << " frames."; } + + if (deriv_sum.Dim() > 0) { + Vector deriv_avg(deriv_sum); + deriv_avg.Scale(1.0 / tot_weight); + KALDI_LOG << "Overall avg deriv for " << name << " is " << deriv_avg; + } + KALDI_LOG << "[this line is to be parsed by a script:] " << "log-prob-per-frame=" << objf; diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h index b047e61e1fb..42837df1f86 100644 --- a/src/nnet3/nnet-training.h +++ b/src/nnet3/nnet-training.h @@ -118,6 +118,8 @@ struct ObjectiveFunctionInfo { double tot_objf_this_phase; double tot_aux_objf_this_phase; + CuVector deriv_sum; + ObjectiveFunctionInfo(): current_phase(0), minibatches_this_phase(0), From 545154a13a684d50f72eecfd76cd997ada4129a2 Mon Sep 17 00:00:00 2001 From: Pegita Date: Fri, 
14 Jul 2017 14:58:11 -0400 Subject: [PATCH 025/174] added scripts for new weight transfer method for transferring all layers. --- egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh | 193 +----- .../s5/local/chain/tuning/run_tdnn_wsj_rm.sh | 192 ++++++ .../local/chain/tuning/run_tdnn_wsj_rm_1a.sh | 192 ++++++ .../local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 211 +++++++ .../local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 201 ++++++ egs/rm/s5/local/online/run_nnet2_common.sh | 15 +- egs/rm/s5/local/prepare_wsj_rm_lang.sh | 65 ++ egs/wsj/s5/steps/align_basis_fmllr.sh | 10 +- .../nnet3/train/chain_objf/acoustic_model.py | 58 +- egs/wsj/s5/steps/nnet3/chain/train.py | 57 +- egs/wsj/s5/steps/nnet3/chain/train_more.py | 590 ++++++++++++++++++ src/bin/Makefile | 2 +- src/chain/language-model.cc | 11 +- src/chain/language-model.h | 6 +- src/chainbin/chain-est-phone-lm.cc | 32 +- 15 files changed, 1559 insertions(+), 276 deletions(-) mode change 100755 => 120000 egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh create mode 100755 egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm.sh create mode 100755 egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh create mode 100755 egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh create mode 100755 egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh create mode 100755 egs/rm/s5/local/prepare_wsj_rm_lang.sh create mode 100755 egs/wsj/s5/steps/nnet3/chain/train_more.py diff --git a/egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh b/egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh deleted file mode 100755 index 1e78f2b90ec..00000000000 --- a/egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/bin/bash - -# This script uses weight transfer as Transfer learning method -# and use already trained model on wsj and remove the last layer and -# add new randomly initialized layer and retrain the whole network. -# while training new added layer using rm data. -# The chain config is as run_tdnn_5n.sh and the result is: -#System tdnn_5n tdnn_wsj_rm -#WER 2.71 2.21 -set -e - -# configs for 'chain' -stage=0 -train_stage=-10 -get_egs_stage=-10 -dir=exp/chain/tdnn_wsj_rm - -# training options -num_epochs=12 -initial_effective_lrate=0.005 -final_effective_lrate=0.0005 -leftmost_questions_truncate=-1 -max_param_change=2.0 -final_layer_normalize_target=0.5 -num_jobs_initial=2 -num_jobs_final=4 -minibatch_size=128 -frames_per_eg=150 -remove_egs=false -xent_regularize=0.1 - -# configs for transfer learning -srcdir=../../wsj/s5/ -common_egs_dir= -src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl -primary_lr_factor=0.25 -dim=450 -nnet_affix=_online -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! cuda-compiled; then - cat <$lang/topo -fi - -if [ $stage -le 6 ]; then - # Build a tree using our new topology. 
- steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --leftmost-questions-truncate $leftmost_questions_truncate \ - --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir -fi - -if [ $stage -le 7 ]; then - echo "$0: creating neural net configs using the xconfig parser for"; - echo "extra layers w.r.t source network."; - num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - mkdir -p $dir - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - relu-renorm-layer name=tdnn7-target input=Append(tdnn6.renorm@-3,tdnn6.renorm@0) dim=$dim - ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain-target input=tdnn7-target dim=$dim target-rms=0.5 - output-layer name=output-target include-log-softmax=false dim=$num_targets max-change=1.5 - relu-renorm-layer name=prefinal-xent-target input=tdnn7-target dim=$dim target-rms=0.5 - output-layer name=output-xent-target dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 -EOF - # edits.config contains edits required to train transferred model. - # e.g. substitute output-node of previous model with new output - # and removing orphan nodes and components. - cat < $dir/configs/edits.config - remove-output-nodes name=output - remove-output-nodes name=output-xent - rename-node old-name=output-target new-name=output - rename-node old-name=output-xent-target new-name=output-xent - remove-orphans -EOF - steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \ - --xconfig-file $dir/configs/network.xconfig \ - --edits-config $dir/configs/edits.config \ - --config-dir $dir/configs/ -fi - -if [ $stage -le 8 ]; then - echo "$0: generate egs for chain to train new model on rm dataset." - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage - fi - echo "$0: set the learning-rate-factor for initial network to be zero." 
- nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" \ - $src_mdl $dir/init.raw || exit 1; - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ - --chain.xent-regularize $xent_regularize \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=200" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width $frames_per_eg \ - --trainer.num-chunk-per-minibatch=$minibatch_size \ - --trainer.frames-per-iter 1000000 \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.num-jobs-initial=$num_jobs_initial \ - --trainer.optimization.num-jobs-final=$num_jobs_final \ - --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ - --trainer.optimization.final-effective-lrate=$final_effective_lrate \ - --trainer.max-param-change $max_param_change \ - --cleanup.remove-egs false \ - --feat-dir data/train_hires \ - --tree-dir $treedir \ - --lat-dir exp/tri3b_lats \ - --dir $dir || exit 1; -fi - -if [ $stage -le 9 ]; then - steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ - data/test $srcdir/exp/nnet3/extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; -fi - -if [ $stage -le 10 ]; then - # Note: it might appear that this $lang directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --scoring-opts "--min-lmwt 1" \ - --nj 20 --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ - $dir/graph data/test_hires $dir/decode || exit 1; -fi - -if [ $stage -le 11 ]; then - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 20 --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ - $dir/graph_ug data/test_hires $dir/decode_ug || exit 1; -fi -wait; -exit 0; diff --git a/egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh b/egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh new file mode 120000 index 00000000000..df30d20b313 --- /dev/null +++ b/egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_wsj_rm_1a.sh \ No newline at end of file diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm.sh new file mode 100755 index 00000000000..1e78f2b90ec --- /dev/null +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm.sh @@ -0,0 +1,192 @@ +#!/bin/bash + +# This script uses weight transfer as Transfer learning method +# and use already trained model on wsj and remove the last layer and +# add new randomly initialized layer and retrain the whole network. +# while training new added layer using rm data. 
+# The chain config is as run_tdnn_5n.sh and the result is: +#System tdnn_5n tdnn_wsj_rm +#WER 2.71 2.21 +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_wsj_rm + +# training options +num_epochs=12 +initial_effective_lrate=0.005 +final_effective_lrate=0.0005 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +xent_regularize=0.1 + +# configs for transfer learning +srcdir=../../wsj/s5/ +common_egs_dir= +src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl +primary_lr_factor=0.25 +dim=450 +nnet_affix=_online +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 6 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir +fi + +if [ $stage -le 7 ]; then + echo "$0: creating neural net configs using the xconfig parser for"; + echo "extra layers w.r.t source network."; + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + mkdir -p $dir + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + relu-renorm-layer name=tdnn7-target input=Append(tdnn6.renorm@-3,tdnn6.renorm@0) dim=$dim + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain-target input=tdnn7-target dim=$dim target-rms=0.5 + output-layer name=output-target include-log-softmax=false dim=$num_targets max-change=1.5 + relu-renorm-layer name=prefinal-xent-target input=tdnn7-target dim=$dim target-rms=0.5 + output-layer name=output-xent-target dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + # edits.config contains edits required to train transferred model. + # e.g. substitute output-node of previous model with new output + # and removing orphan nodes and components. + cat < $dir/configs/edits.config + remove-output-nodes name=output + remove-output-nodes name=output-xent + rename-node old-name=output-target new-name=output + rename-node old-name=output-xent-target new-name=output-xent + remove-orphans +EOF + steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \ + --xconfig-file $dir/configs/network.xconfig \ + --edits-config $dir/configs/edits.config \ + --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + echo "$0: generate egs for chain to train new model on rm dataset." + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + echo "$0: set the learning-rate-factor for initial network to be zero." 
+ nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" \ + $src_mdl $dir/init.raw || exit 1; + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ + --chain.xent-regularize $xent_regularize \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=200" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch=$minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs false \ + --feat-dir data/train_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri3b_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 9 ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ + data/test $srcdir/exp/nnet3/extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; +fi + +if [ $stage -le 10 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 1" \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ + $dir/graph data/test_hires $dir/decode || exit 1; +fi + +if [ $stage -le 11 ]; then + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ + $dir/graph_ug data/test_hires $dir/decode_ug || exit 1; +fi +wait; +exit 0; diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh new file mode 100755 index 00000000000..e232bd39e46 --- /dev/null +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -0,0 +1,192 @@ +#!/bin/bash + +# This script uses weight transfer as a Transfer learning method +# and use already trained model on wsj and remove the last layer and +# add new randomly initialized layer and retrain the whole network. +# while training new added layer using rm data. 
+# The chain config is as run_tdnn_5n.sh and the result is: +#System tdnn_5n tdnn_wsj_rm +#WER 2.71 2.21 +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_wsj_rm_1a + +# training options +num_epochs=2 +initial_effective_lrate=0.005 +final_effective_lrate=0.0005 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +xent_regularize=0.1 + +# configs for transfer learning +srcdir=../../wsj/s5/ +common_egs_dir=exp/chain/tdnn_wsj_rm_1c_fixed_ac_scale/egs +src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl +primary_lr_factor=0.25 +dim=450 +nnet_affix=_online +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 6 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir +fi + +if [ $stage -le 7 ]; then + echo "$0: creating neural net configs using the xconfig parser for"; + echo "extra layers w.r.t source network."; + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + mkdir -p $dir + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + relu-renorm-layer name=tdnn7-target input=Append(tdnn6.renorm@-3,tdnn6.renorm@0) dim=$dim + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain-target input=tdnn7-target dim=$dim target-rms=0.5 + output-layer name=output-target include-log-softmax=false dim=$num_targets max-change=1.5 + relu-renorm-layer name=prefinal-xent-target input=tdnn7-target dim=$dim target-rms=0.5 + output-layer name=output-xent-target dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + # edits.config contains edits required to train transferred model. + # e.g. substitute output-node of previous model with new output + # and removing orphan nodes and components. + cat < $dir/configs/edits.config + remove-output-nodes name=output + remove-output-nodes name=output-xent + rename-node old-name=output-target new-name=output + rename-node old-name=output-xent-target new-name=output-xent + remove-orphans +EOF + steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \ + --xconfig-file $dir/configs/network.xconfig \ + --edits-config $dir/configs/edits.config \ + --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + echo "$0: generate egs for chain to train new model on rm dataset." + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + echo "$0: set the learning-rate-factor for initial network to be zero." 
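The edits.config above retargets the transferred model: the source model's output nodes are removed, the newly added output nodes are renamed into their place, and orphaned nodes and components are cleaned up. A sketch of generating such a file from old/new name pairs; the helper itself is hypothetical, but the directives are the ones used above:

# Hypothetical helper that writes an edits.config like the one above.
def write_edits_config(path, renames):
    """renames: list of (new_node_name, final_node_name) pairs."""
    with open(path, "w") as f:
        for _, final_name in renames:
            f.write("remove-output-nodes name=%s\n" % final_name)
        for new_name, final_name in renames:
            f.write("rename-node old-name=%s new-name=%s\n" % (new_name, final_name))
        f.write("remove-orphans\n")

# Reproduces the file above (written to the current directory as a scratch file).
write_edits_config("edits.config",
                   [("output-target", "output"),
                    ("output-xent-target", "output-xent")])
print(open("edits.config").read())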
+ nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" \ + $src_mdl $dir/init.raw || exit 1; + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ + --chain.xent-regularize $xent_regularize \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=200" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch=$minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs false \ + --feat-dir data/train_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri3b_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 9 ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ + data/test $srcdir/exp/nnet3/extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; +fi + +if [ $stage -le 10 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 1" \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ + $dir/graph data/test_hires $dir/decode || exit 1; +fi + +if [ $stage -le 11 ]; then + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ + $dir/graph_ug data/test_hires $dir/decode_ug || exit 1; +fi +wait; +exit 0; diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh new file mode 100755 index 00000000000..897bd21a788 --- /dev/null +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -0,0 +1,211 @@ +#!/bin/bash +# _1b is as _1a but uses a src-tree-dir to generate new target alignment and lattices +# using source model. It also combines +# alignemts from source and target to train phone LM for den.fst in chain denominator graph. + +# This script uses weight transfer as Transfer learning method +# and use already trained model on wsj and remove the last layer and +# add new randomly initialized layer and retrain the whole network. +# while training new added layer using rm data. 
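The 1b variant introduced above trains the denominator phone LM on a weighted combination of source and target alignments: every phone n-gram from the target alignments is counted with weight tg_lm_scale, every n-gram from the source alignments with weight src_lm_scale. A toy sketch of that weighted counting, with invented phone sequences (the real estimation is done by chain-est-phone-lm):

# Toy sketch of weighted phone n-gram counting for the denominator LM.
# Sequences and weights below are invented for illustration only.
from collections import defaultdict

def add_counts(counts, sentence, weight, order=2):
    history = []
    for phone in sentence + [0]:          # 0 marks end of sentence
        counts[(tuple(history), phone)] += weight
        history.append(phone)
        if len(history) >= order:
            history.pop(0)

counts = defaultdict(int)
for seq in [[1, 2, 3], [1, 3]]:           # "target" (RM) alignments, weight 10
    add_counts(counts, seq, weight=10)
for seq in [[1, 2, 2, 3]]:                # "source" (WSJ) alignments, weight 1
    add_counts(counts, seq, weight=1)
print(counts[((1,), 2)])                  # bigram "1 -> 2": 10 (target) + 1 (source) = 11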
+# The chain config is as run_tdnn_5n.sh and the result is: +# System tdnn_5n tdnn_wsj_rm_1a tdnn_wsj_rm_1b tdnn_wsj_rm_1c +# WER 2.71 2.09 3.45 3.38 +set -e + +# configs for 'chain' +stage=7 +train_stage=-10 +get_egs_stage=-10 + +# training options +num_epochs=2 +initial_effective_lrate=0.005 +final_effective_lrate=0.0005 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=32 +frames_per_eg=150 +remove_egs=false +xent_regularize=0.1 + +# configs for transfer learning +common_egs_dir= +#srcdir=../../wsj/s5/ +srcdir=/export/a09/pegahgh/kaldi-transfer-learning/egs/wsj/s5-sp +src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl +src_lang=$srcdir/data/lang +src_gmm_mdl=$srcdir/exp/tri4b +src_tree_dir=$srcdir/exp/chain/tree_a_sp # chain tree-dir for src data; + # the alignment in target domain is + # converted using src-tree +primary_lr_factor=0.25 # learning-rate factor for all except last layer in transferring source model +final_lr_factor=1.0 # learning-rate factor for final layer in transferring source model. +nnet_affix=_online_wsj +tg_lm_scale=10 +src_lm_scale=1 +tdnn_affix=_1b +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + output-layer name=output-tmp input=tdnn6.renorm dim=$num_targets +EOF + # edits.config contains edits required to train transferred model. + # e.g. substitute output-node of previous model with new output + # and removing orphan nodes and components. + cat < $dir/configs/edits.config + remove-output-nodes name=output-tmp + remove-orphans +EOF + steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \ + --xconfig-file $dir/configs/network.xconfig \ + --edits-config $dir/configs/edits.config \ + --config-dir $dir/configs/ +fi + +converted_ali_dir=exp/converted_ali_wsj +if [ $stage -le 8 ]; then + echo "$0: convert target alignment using tree in src-tree-dir" + mkdir -p $converted_ali_dir + mkdir -p $converted_ali_dir/log + num_ali_job=`cat $ali_dir/num_jobs` + cp $ali_dir/num_jobs $converted_ali_dir + cp $src_tree_dir/{tree,final.mdl} $converted_ali_dir + $decode_cmd JOB=1:$num_ali_job $converted_ali_dir/log/convert_ali.JOB.log \ + convert-ali $ali_dir/final.mdl $src_tree_dir/final.mdl \ + $src_tree_dir/tree "ark:gunzip -c $ali_dir/ali.JOB.gz |" \ + "ark:| gzip -c > $converted_ali_dir/ali.JOB.gz" +fi + +if [ $stage -le 9 ]; then + echo "$0: generate egs for chain to train new model on rm dataset." + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + echo "$0: set the learning-rate-factor for initial network to be zero." 
+ $decode_cmd $dir/log/copy_mdl.log \ + nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=$final_lr_factor" \ + $src_mdl $dir/init.raw || exit 1; + + steps/nnet3/chain/train_more.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ + --chain.xent-regularize $xent_regularize \ + --chain.alignments-for-lm="$converted_ali_dir:$tg_lm_scale,$src_tree_dir:$src_lm_scale" \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=200" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch=$minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs false \ + --feat-dir data/train_hires \ + --tree-dir $src_tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 10 ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ + data/test_hires $srcdir/exp/nnet3/extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; +fi + +if [ $stage -le 11 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $lang_dir $dir $dir/graph + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 1" \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ + $dir/graph data/test_hires $dir/decode || exit 1; +fi + +if [ $stage -le 12 ]; then + utils/mkgraph.sh --self-loop-scale 1.0 $lang_ug_dir $dir $dir/graph_ug + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ + $dir/graph_ug data/test_hires $dir/decode_ug || exit 1; +fi +wait; +exit 0; diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh new file mode 100755 index 00000000000..5f4aa43d01a --- /dev/null +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -0,0 +1,201 @@ +#!/bin/bash +# _1c is as _1b but it uses chain model to generate alignment for RM using SWJ model. +# _1b is as _1a but uses a src-tree-dir to convert target alignment and combine +# alignemts from source and target to train phone LM for den.fst in chain denominator graph. + +# This script uses weight transfer as Transfer learning method +# and use already trained model on wsj and remove the last layer and +# add new randomly initialized layer and retrain the whole network. +# while training new added layer using rm data. 
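The --chain.alignments-for-lm option used in the 1b training call above (and again in the 1c recipe below) packs alignment directories and their phone-LM weights into one string of dir:weight pairs. A small sketch of splitting that spec apart, assuming the format described in the option's help text (a missing weight defaults to 1):

# Sketch: split "dir1:10,dir2:1" into parallel lists of directories and scales.
def parse_alignments_for_lm(spec):
    ali_dirs, scales = [], []
    for entry in spec.split(","):
        parts = entry.split(":")
        ali_dirs.append(parts[0])
        scales.append(int(parts[1]) if len(parts) == 2 else 1)
    return ali_dirs, scales

dirs, scales = parse_alignments_for_lm("exp/converted_ali_wsj:10,exp/chain/tree_a_sp:1")
print(dirs)    # ['exp/converted_ali_wsj', 'exp/chain/tree_a_sp']
print(scales)  # [10, 1]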
+# The chain config is as run_tdnn_5n.sh and the result is: +#System tdnn_5n tdnn_wsj_rm +#WER 2.71 2.21 +set -e + +# configs for 'chain' +stage=8 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_wsj_rm_1c + +# training options +frames_per_chunk=150 +num_epochs=2 +initial_effective_lrate=0.005 +final_effective_lrate=0.0005 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=32 +frames_per_eg=150 +remove_egs=false +xent_regularize=0.1 + +# configs for transfer learning +common_egs_dir= +srcdir=../../wsj/s5 +src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl +src_lang=$srcdir/data/lang +src_tree_dir=$srcdir/exp/chain/tree_a_sp # chain tree-dir for src data; + # the alignment in target domain is + # converted using src-tree +primary_lr_factor=0.25 # learning-rate factor for all except last layer in transferred source model +final_lr_factor=1.0 # learning-rate factor for final affine layer in transferred source model. +nnet_affix=_online_wsj + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + output-layer name=output-tmp input=tdnn6.renorm dim=$num_targets +EOF + + cat < $dir/configs/edits.config + remove-output-nodes name=output-tmp + remove-orphans +EOF + steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \ + --xconfig-file $dir/configs/network.xconfig \ + --edits-config $dir/configs/edits.config \ + --config-dir $dir/configs/ +fi + +if [ $stage -le 7 ]; then + echo "$0: generate egs for chain to train new model on rm dataset." + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + echo "$0: set the learning-rate-factor for initial network to be zero." 
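The nnet3-am-copy call in the next stage applies two set-learning-rate-factor edits in order: a wildcard first sets every component's factor to primary_lr_factor, then the more specific output* pattern resets the output layers to final_lr_factor. A toy emulation of that ordered pattern matching, with example component names:

# Toy emulation of ordered "set-learning-rate-factor name=<pattern>" edits:
# a later edit overwrites earlier ones for the components it matches.
# Component names are examples, not read from a real model.
from fnmatch import fnmatch

components = ["tdnn1.affine", "tdnn6.affine", "output.affine", "output-xent.affine"]
edits = [("*", 0.25),          # primary_lr_factor for the whole transferred model
         ("output*", 1.0)]     # final_lr_factor for the output layers

factors = {}
for name in components:
    for pattern, factor in edits:
        if fnmatch(name, pattern):
            factors[name] = factor     # last matching edit wins
for name in components:
    print("%-20s factor = %.2f" % (name, factors[name]))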
+ $decode_cmd $dir/log/copy_mdl.log \ + nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=$final_lr_factor" \ + $src_mdl $dir/init.raw || exit 1; + + steps/nnet3/chain/train_more.py --stage $train_stage ${chain_opts[@]} \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ + --chain.xent-regularize $xent_regularize \ + --chain.alignments-for-lm="$ali_dir:10,$src_tree_dir:1" \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=200" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch=$minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs false \ + --feat-dir data/train_hires \ + --tree-dir $src_tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 8 ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ + data/test_hires $srcdir/exp/nnet3/extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; +fi + +if [ $stage -le 9 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $lang_src_tgt $dir $dir/graph + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 1" \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ + $dir/graph data/test_hires $dir/decode || exit 1; +fi + +if [ $stage -le 10 ]; then + utils/mkgraph.sh --self-loop-scale 1.0 ${lang_src_tgt}_ug $dir $dir/graph_ug + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ + $dir/graph_ug data/test_hires $dir/decode_ug || exit 1; +fi +wait; +exit 0; diff --git a/egs/rm/s5/local/online/run_nnet2_common.sh b/egs/rm/s5/local/online/run_nnet2_common.sh index 27ad7cf9aeb..e1a4676da66 100755 --- a/egs/rm/s5/local/online/run_nnet2_common.sh +++ b/egs/rm/s5/local/online/run_nnet2_common.sh @@ -47,27 +47,28 @@ if [ $stage -le 0 ]; then done fi +train_set=${train_set}_hires if [ ! -f $extractor/final.dubm ]; then if [ $stage -le 1 ]; then mkdir -p exp/nnet2${nnet_affix} - steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 10 --num-frames 200000 \ - data/train 256 exp/tri3b exp/nnet2${nnet_affix}/diag_ubm + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 40 --num-frames 200000 \ + data/${train_set} 256 exp/tri3b exp/nnet2${nnet_affix}/diag_ubm fi if [ $stage -le 2 ]; then # use a smaller iVector dim (50) than the default (100) because RM has a very # small amount of data. 
- steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 4 \ + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 40 \ --ivector-dim $ivector_dim \ - data/train exp/nnet2${nnet_affix}/diag_ubm $extractor || exit 1; + data/${train_set} exp/nnet2${nnet_affix}/diag_ubm $extractor || exit 1; fi fi if [ $stage -le 3 ]; then # having a larger number of speakers is helpful for generalization, and to # handle per-utterance decoding well (iVector starts at zero). - steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train_max2 + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set} data/${train_set}_max2 - steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ - data/train_max2 $extractor exp/nnet2${nnet_affix}/ivectors || exit 1; + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 40 \ + data/${train_set}_max2 $extractor exp/nnet2${nnet_affix}/ivectors || exit 1; fi diff --git a/egs/rm/s5/local/prepare_wsj_rm_lang.sh b/egs/rm/s5/local/prepare_wsj_rm_lang.sh new file mode 100755 index 00000000000..56dd351a8d2 --- /dev/null +++ b/egs/rm/s5/local/prepare_wsj_rm_lang.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# Copyright 2017 Pegah Ghahremani + +# This script prepares a dictionary for sourc-target using source phone set, lexicon and dict and +# target words.txt are copied from source lexicon for common words in source +# and target. words in target that are not available in the source lexicon are added +# as oov in lexicon.txt. +# The is also added to words.txt and G.fst is recompiled using +# updated word list. + +. utils/parse_options.sh + +if [ $# != 3 ]; then + echo "Usage: local/prepare_wsj_rm_lang.sh " + echo "e.g:" + echo "$0 ../../wsj/s5/data/local/dict ../../wsj/s5/data/lang/phones.txt data/wsj_rm_dir" +fi + +src_dict=$1 +src_phones=$2 +src_tg_lang=$3 + +required_dict_files="lexicon.txt nonsilence_phones.txt silence_phones.txt optional_silence.txt" +for f in $required_dict_files; do + if [ ! -f $src_dict/$f ]; then + echo "file $src_dict/$f that is required for preparing lang does not exists." && exit 1; + fi +done + +rm -rf $src_tg_lang +mkdir -p $src_tg_lang +mkdir -p $src_tg_lang/local +# copy *phones.txt from source to target. +cp -r $src_dict $src_tg_lang/local/dict +rm $src_tg_lang/local/dict/lexicon.txt + +# common word list in rm lexicon with lexicon in wsj +comm -12 <(awk '{print $1}' data/local/dict/lexicon.txt | sed "s/\+/\'/g" | sort) \ +<(awk '{print $1}' $src_dict/lexicon.txt | sort) | \ +sed -r "s/'/+/g" | sort > $src_tg_lang/words_tmp.txt + +comm -23 <(awk '{print $1}' data/local/dict/lexicon.txt | sed "s/\+/\'/g" | sort) \ +<(awk '{print $1}' $src_dict/lexicon.txt | sort) | \ +sed -r "s/'/+/g" | sort > $src_tg_lang/words_only_tg.txt + +# add to rm_swj_word list +(echo ""; cat $src_tg_lang/words_tmp.txt) | sort > $src_tg_lang/words_tg_src.txt +rm $src_tg_lang/words_tmp.txt + +# we use wsj lexicon and find common word list in rm and wsj to generate lexicon for rm +# using wsj phone sets. More than 90% of words in RM are in WSJ(950/994). +cat $src_tg_lang/words_tg_src.txt | sed "s/\+/\'/g" | \ +utils/apply_map.pl --permissive $src_dict/lexicon.txt | \ +paste <(cat $src_tg_lang/words_tg_src.txt) - > $src_tg_lang/local/dict/lexicon_tg_src.txt + +# extend lexicon.txt by adding only_tg words as oov. 
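Putting the pieces of prepare_wsj_rm_lang.sh together: words shared by the two lexicons keep their WSJ pronunciations, and the command just below gives every RM-only word a single SPN (oov) pronunciation. A toy sketch of that merge, with invented word lists (the real script also sorts, deduplicates, and works through temporary files):

# Toy sketch: build a target lexicon from a source lexicon, mapping shared
# words to their source pronunciations and unknown words to SPN (oov).
# The entries below are invented; the real script reads lexicon.txt files.
src_lexicon = {"ABOARD": "AH B AO R D", "ABOVE": "AH B AH V"}
tg_words = ["ABOARD", "ABOVE", "REDSTONE+S"]      # target word list ('+' spelling)

merged = {}
for word in tg_words:
    src_word = word.replace("+", "'")             # RM writes '+' where WSJ uses an apostrophe
    merged[word] = src_lexicon.get(src_word, "SPN")
for word in sorted(merged):
    print("%s %s" % (word, merged[word]))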
+cat $src_tg_lang/local/dict/lexicon_tg_src.txt <(sed 's/$/ SPN/g' $src_tg_lang/words_only_tg.txt) | sort -u > $src_tg_lang/local/dict/lexicon.txt + +# prepare dictionary using new lexicon.txt for RM-SWJ. +utils/prepare_lang.sh --phone-symbol-table $src_phones \ +$src_tg_lang/local/dict "" $src_tg_lang/local/lang_tmp $src_tg_lang + +# Generate new G.fst using updated words list with added +fstcompile --isymbols=$src_tg_lang/words.txt --osymbols=$src_tg_lang/words.txt --keep_isymbols=false \ + -keep_osymbols=false data/local/tmp/G.txt | fstarcsort --sort_type=ilabel > $src_tg_lang/G.fst || exit 1; diff --git a/egs/wsj/s5/steps/align_basis_fmllr.sh b/egs/wsj/s5/steps/align_basis_fmllr.sh index 9536e9e1266..d65986bd9ec 100755 --- a/egs/wsj/s5/steps/align_basis_fmllr.sh +++ b/egs/wsj/s5/steps/align_basis_fmllr.sh @@ -13,7 +13,7 @@ # case the number of jobs must match the source directory. -# Begin configuration section. +# Begin configuration section. stage=0 nj=4 cmd=run.pl @@ -75,7 +75,7 @@ echo "$0: feature type is $feat_type" case $feat_type in delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" - cp $srcdir/final.mat $dir + cp $srcdir/final.mat $dir ;; *) echo "Invalid feature type $feat_type" && exit 1; esac @@ -88,7 +88,7 @@ else alimdl=$srcdir/final.mdl fi [ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1; -alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/boost_phones.csl` $alimdl - |" +alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |" mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |" ## Work out where we're getting the graphs from. @@ -101,7 +101,7 @@ else graphdir=$dir if [ $stage -le 0 ]; then echo "$0: compiling training graphs" - tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; @@ -138,7 +138,7 @@ if [ $stage -le 2 ]; then fi fi -feats="$sifeats transform-feats ark:$dir/pre_trans.JOB ark:- ark:- |" +feats="$sifeats transform-feats ark:$dir/trans.JOB ark:- ark:- |" if [ $stage -le 3 ]; then echo "$0: doing final alignment." diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index bbb925a9e58..0a2921f1322 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -12,6 +12,7 @@ import math import os import sys +import filecmp import libs.common as common_lib import libs.nnet3.train.common as common_train_lib @@ -21,31 +22,54 @@ def create_phone_lm(dir, tree_dir, run_opts, lm_opts=None): - """Create a phone LM for chain training + """Create a phone LM for chain training. 
- This method trains a phone LM for chain training using the alignments + This method trains a weighted phone LM for chain training generated by + weighted combination of counts in alignments in "tree_dir" + "tree_dir" is a list of comma-separated alignments directories with + same tree and phone sets (each containing tree and ali.*.gz) + and the colon-separated integer weight appended + to each alignment dir.(if none, the weight is 1) + i.e. ali_dir1:w1,ali_dir2:w2 """ - try: - f = open(tree_dir + "/num_jobs", 'r') - num_ali_jobs = int(f.readline()) - assert num_ali_jobs > 0 - except: - raise Exception("""There was an error getting the number of alignment - jobs from {0}/num_jobs""".format(tree_dir)) + tree_dirs = tree_dir.split(",") + ali_dirs = [] + scales = [] + for tree_dir in tree_dirs: + ali_and_weights = tree_dir.split(":") + ali_dirs.append(ali_and_weights[0]) + if (len(ali_and_weights) == 2): + scales.append(str(int(ali_and_weights[1]))) + else: + scales.append(1) + phone_lists = [] + for ali_dir in ali_dirs: + try: + f = open(ali_dir + "/num_jobs", 'r') + num_ali_jobs = int(f.readline()) + assert num_ali_jobs > 0 + except: + raise Exception("""There was an error getting the number of alignment + jobs from {0}/num_jobs""".format(ali_dir)) + + assert(filecmp.cmp('{0}/tree'.format(ali_dirs[0]), '{0}/tree'.format(ali_dir))) + + alignments=' '.join(['{0}/ali.{1}.gz'.format(ali_dir, job) + for job in range(1, num_ali_jobs + 1)]) + phone_lists.append("""'ark:gunzip -c {0} | ali-to-phones {1}/final.mdl ark:- ark:-|'""".format(alignments, ali_dir)) - alignments=' '.join(['{0}/ali.{1}.gz'.format(tree_dir, job) - for job in range(1, num_ali_jobs + 1)]) + phone_lists_str=' '.join(phone_lists) + if len(scales) > 0: + scales_str=','.join(scales) + lm_opts="{0} --scales={1}".format(lm_opts, scales_str) common_lib.execute_command( """{command} {dir}/log/make_phone_lm.log \ - gunzip -c {alignments} \| \ - ali-to-phones {tree_dir}/final.mdl ark:- ark:- \| \ - chain-est-phone-lm {lm_opts} ark:- {dir}/phone_lm.fst""".format( + chain-est-phone-lm {lm_opts} {phone_lists} {dir}/phone_lm.fst""".format( command=run_opts.command, dir=dir, - alignments=alignments, lm_opts=lm_opts if lm_opts is not None else '', - tree_dir=tree_dir)) + phone_lists=phone_lists_str)) def create_denominator_fst(dir, tree_dir, run_opts): @@ -355,7 +379,7 @@ def check_for_required_files(feat_dir, tree_dir, lat_dir): files = ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), - '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)] + '{0}/num_jobs'.format(lat_dir)] for file in files: if not os.path.isfile(file): raise Exception('Expected {0} to exist.'.format(file)) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index c44e263550d..e599c981f94 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -145,19 +145,6 @@ def get_args(): steps/nnet3/get_saturation.pl) exceeds this threshold we scale the parameter matrices with the shrink-value.""") - parser.add_argument("--trainer.optimization.proportional-shrink", type=float, - dest='proportional_shrink', default=0.0, - help="""If nonzero, this will set a shrinkage (scaling) - factor for the parameters, whose value is set as: - shrink-value=(1.0 - proportional-shrink * learning-rate), where - 'learning-rate' is the learning rate being applied - on 
the current iteration, which will vary from - initial-effective-lrate*num-jobs-initial to - final-effective-lrate*num-jobs-final. - Unlike for train_rnn.py, this is applied unconditionally, - it does not depend on saturation of nonlinearities. - Can be used to roughly approximate l2 regularization.""") - # RNN-specific training options parser.add_argument("--trainer.deriv-truncate-margin", type=int, dest='deriv_truncate_margin', default=None, @@ -224,7 +211,8 @@ def process_args(args): or not os.path.exists(args.dir+"/configs")): raise Exception("This scripts expects {0} to exist and have a configs " "directory which is the output of " - "make_configs.py script") + "make_configs.py script".format( + args.dir)) if args.transform_dir is None: args.transform_dir = args.lat_dir @@ -418,14 +406,6 @@ def train(args, run_opts): num_archives_expanded, args.max_models_combine, args.num_jobs_final) - def learning_rate(iter, current_num_jobs, num_archives_processed): - return common_train_lib.get_learning_rate(iter, current_num_jobs, - num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate) - min_deriv_time = None max_deriv_time_relative = None if args.deriv_truncate_margin is not None: @@ -447,22 +427,23 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): if args.stage <= iter: model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter) - lrate = learning_rate(iter, current_num_jobs, - num_archives_processed) - shrink_value = 1.0 - if args.proportional_shrink != 0.0: - shrink_value = 1.0 - (args.proportional_shrink * lrate) - if shrink_value <= 0.5: - raise Exception("proportional-shrink={0} is too large, it gives " - "shrink-value={1}".format(args.proportional_shrink, - shrink_value)) - - if args.shrink_value < shrink_value: - shrink_value = (args.shrink_value + lrate = common_train_lib.get_learning_rate(iter, current_num_jobs, + num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + shrinkage_value = 1.0 - (args.proportional_shrink * lrate) + if shrinkage_value <= 0.5: + raise Exception("proportional-shrink={0} is too large, it gives " + "shrink-value={1}".format(args.proportional_shrink, + shrinkage_value)) + if args.shrink_value < shrinkage_value: + shrinkage_value = (args.shrink_value if common_train_lib.should_do_shrinkage( iter, model_file, args.shrink_saturation_threshold) - else shrink_value) + else shrinkage_value) chain_lib.train_one_iteration( dir=args.dir, @@ -477,7 +458,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), - shrinkage_value=shrink_value, + shrinkage_value=shrinkage_value, num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, apply_deriv_weights=args.apply_deriv_weights, min_deriv_time=min_deriv_time, @@ -489,7 +470,9 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): max_param_change=args.max_param_change, shuffle_buffer_size=args.shuffle_buffer_size, frame_subsampling_factor=args.frame_subsampling_factor, - run_opts=run_opts) + run_opts=run_opts, + backstitch_training_scale=args.backstitch_training_scale, + backstitch_training_interval=args.backstitch_training_interval) if args.cleanup: # do a clean up everythin but the last 2 models, under certain diff --git a/egs/wsj/s5/steps/nnet3/chain/train_more.py b/egs/wsj/s5/steps/nnet3/chain/train_more.py new file mode 100755 index 
00000000000..3577719aed3 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/train_more.py @@ -0,0 +1,590 @@ +#!/usr/bin/env python + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. + +""" This script is based on steps/nnet3/chain/train.sh +""" + +import argparse +import logging +import os +import pprint +import shutil +import sys +import traceback + +sys.path.insert(0, 'steps') +import libs.nnet3.train.common as common_train_lib +import libs.common as common_lib +import libs.nnet3.train.chain_objf.acoustic_model as chain_lib +import libs.nnet3.report.log_parse as nnet3_log_parse + + +logger = logging.getLogger('libs') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting chain model trainer (train.py)') + + +def get_args(): + """ Get args from stdin. + + We add compulsary arguments as named arguments for readability + + The common options are defined in the object + libs.nnet3.train.common.CommonParser.parser. + See steps/libs/nnet3/train/common.py + """ + + parser = argparse.ArgumentParser( + description="""Trains RNN and DNN acoustic models using the 'chain' + objective function.""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler='resolve', + parents=[common_train_lib.CommonParser().parser]) + + # egs extraction options + parser.add_argument("--egs.chunk-width", type=str, dest='chunk_width', + default="20", + help="""Number of frames per chunk in the examples + used to train the RNN. Caution: if you double this you + should halve --trainer.samples-per-iter. May be + a comma-separated list of alternatives: first width + is the 'principal' chunk-width, used preferentially""") + + # chain options + parser.add_argument("--chain.lm-opts", type=str, dest='lm_opts', + default=None, action=common_lib.NullstrToNoneAction, + help="options to be be passed to chain-est-phone-lm") + parser.add_argument("--chain.alignments-for-lm", type=str, + dest='alignments_for_lm', default=None, + action=common_lib.NullstrToNoneAction, + help="""Comma-separated list of alignment Directories + containing ali.*.gz and their integer-valued weights + (separated with colon), used to + generate weighted phone language model for + denominator FST. The phone sets should be similar + for all alignment dirs and tree-dir. + If empty, alignments in tree-dir used for phone LM + generation. + e.g. "src1/ali_dir:10.0,src2/ali_dir:2.0" + """) + parser.add_argument("--chain.l2-regularize", type=float, + dest='l2_regularize', default=0.0, + help="""Weight of regularization function which is the + l2-norm of the output of the network. It should be used + without the log-softmax layer for the outputs. 
As + l2-norm of the log-softmax outputs can dominate the + objective function.""") + parser.add_argument("--chain.xent-regularize", type=float, + dest='xent_regularize', default=0.0, + help="Weight of regularization function which is the " + "cross-entropy cost the outputs.") + parser.add_argument("--chain.right-tolerance", type=int, + dest='right_tolerance', default=5, help="") + parser.add_argument("--chain.left-tolerance", type=int, + dest='left_tolerance', default=5, help="") + parser.add_argument("--chain.leaky-hmm-coefficient", type=float, + dest='leaky_hmm_coefficient', default=0.00001, + help="") + parser.add_argument("--chain.apply-deriv-weights", type=str, + dest='apply_deriv_weights', default=True, + action=common_lib.StrToBoolAction, + choices=["true", "false"], + help="") + parser.add_argument("--chain.frame-subsampling-factor", type=int, + dest='frame_subsampling_factor', default=3, + help="ratio of frames-per-second of features we " + "train on, to chain model's output") + parser.add_argument("--chain.alignment-subsampling-factor", type=int, + dest='alignment_subsampling_factor', + default=3, + help="ratio of frames-per-second of input " + "alignments to chain model's output") + parser.add_argument("--chain.left-deriv-truncate", type=int, + dest='left_deriv_truncate', + default=None, + help="Deprecated. Kept for back compatibility") + + # trainer options + parser.add_argument("--trainer.num-epochs", type=float, dest='num_epochs', + default=10.0, + help="Number of epochs to train the model") + parser.add_argument("--trainer.frames-per-iter", type=int, + dest='frames_per_iter', default=800000, + help="""Each iteration of training, see this many + [input] frames per job. This option is passed to + get_egs.sh. Aim for about a minute of training + time""") + + parser.add_argument("--trainer.num-chunk-per-minibatch", type=str, + dest='num_chunk_per_minibatch', default='128', + help="""Number of sequences to be processed in + parallel every minibatch. May be a more general + rule as accepted by the --minibatch-size option of + nnet3-merge-egs; run that program without args to see + the format.""") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", + type=float, dest='initial_effective_lrate', + default=0.0002, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", + type=float, dest='final_effective_lrate', + default=0.00002, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.shrink-value", type=float, + dest='shrink_value', default=1.0, + help="""Scaling factor used for scaling the parameter + matrices when the derivative averages are below the + shrink-threshold at the non-linearities. E.g. 0.99. + Only applicable when the neural net contains sigmoid or + tanh units.""") + parser.add_argument("--trainer.optimization.shrink-saturation-threshold", + type=float, + dest='shrink_saturation_threshold', default=0.40, + help="""Threshold that controls when we apply the + 'shrinkage' (i.e. scaling by shrink-value). 
If the + saturation of the sigmoid and tanh nonlinearities in + the neural net (as measured by + steps/nnet3/get_saturation.pl) exceeds this threshold + we scale the parameter matrices with the + shrink-value.""") + parser.add_argument("--trainer.optimization.proportional-shrink", type=float, + dest='proportional_shrink', default=0.0, + help="""If nonzero, this will set a shrinkage (scaling) + factor for the parameters, whose value is set as: + shrink-value=(1.0 - proportional-shrink * learning-rate), where + 'learning-rate' is the learning rate being applied + on the current iteration, which will vary from + initial-effective-lrate*num-jobs-initial to + final-effective-lrate*num-jobs-final. + Unlike for train_rnn.py, this is applied unconditionally, + it does not depend on saturation of nonlinearities. + Can be used to roughly approximate l2 regularization.""") + + # RNN-specific training options + parser.add_argument("--trainer.deriv-truncate-margin", type=int, + dest='deriv_truncate_margin', default=None, + help="""(Relevant only for recurrent models). If + specified, gives the margin (in input frames) around + the 'required' part of each chunk that the derivatives + are backpropagated to. If unset, the derivatives are + backpropagated all the way to the boundaries of the + input data. E.g. 8 is a reasonable setting. Note: the + 'required' part of the chunk is defined by the model's + {left,right}-context.""") + # General options + parser.add_argument("--feat-dir", type=str, required=True, + help="Directory with features used for training " + "the neural network.") + parser.add_argument("--tree-dir", type=str, required=True, + help="""Directory containing the tree to use for this + model (we also expect final.mdl and ali.*.gz in that + directory""") + parser.add_argument("--lat-dir", type=str, required=True, + help="Directory with numerator lattices " + "used for training the neural network.") + parser.add_argument("--dir", type=str, required=True, + help="Directory to store the models and " + "all other files.") + + print(' '.join(sys.argv)) + print(sys.argv) + + args = parser.parse_args() + + [args, run_opts] = process_args(args) + + return [args, run_opts] + + +def process_args(args): + """ Process the options got from get_args() + """ + + if not common_train_lib.validate_chunk_width(args.chunk_width): + raise Exception("--egs.chunk-width has an invalid value"); + + if not common_train_lib.validate_minibatch_size_str(args.num_chunk_per_minibatch): + raise Exception("--trainer.num-chunk-per-minibatch has an invalid value"); + + if args.chunk_left_context < 0: + raise Exception("--egs.chunk-left-context should be non-negative") + + if args.chunk_right_context < 0: + raise Exception("--egs.chunk-right-context should be non-negative") + + if args.left_deriv_truncate is not None: + args.deriv_truncate_margin = -args.left_deriv_truncate + logger.warning( + "--chain.left-deriv-truncate (deprecated) is set by user, and " + "--trainer.deriv-truncate-margin is set to negative of that " + "value={0}. 
We recommend using the option " + "--trainer.deriv-truncate-margin.".format( + args.deriv_truncate_margin)) + + if (not os.path.exists(args.dir) + or not os.path.exists(args.dir+"/configs")): + raise Exception("This scripts expects {0} to exist and have a configs " + "directory which is the output of " + "make_configs.py script") + + if args.transform_dir is None: + args.transform_dir = args.lat_dir + # set the options corresponding to args.use_gpu + run_opts = common_train_lib.RunOpts() + if args.use_gpu: + if not common_lib.check_if_cuda_compiled(): + logger.warning( + """You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. + If you have GPUs and have nvcc installed, go to src/ and do + ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + + else: + logger.warning("Without using a GPU this will be very slow. " + "nnet3 does not yet support multiple threads.") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + + run_opts.command = args.command + run_opts.egs_command = (args.egs_command + if args.egs_command is not None else + args.command) + + return [args, run_opts] + + +def train(args, run_opts): + """ The main function for training. + + Args: + args: a Namespace object with the required parameters + obtained from the function process_args() + run_opts: RunOpts object obtained from the process_args() + """ + + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + # Check files + chain_lib.check_for_required_files(args.feat_dir, args.tree_dir, + args.lat_dir) + + # Set some variables. + num_jobs = 4 #common_lib.get_number_of_jobs(args.lat_dir) + feat_dim = common_lib.get_feat_dim(args.feat_dir) + ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) + ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir) + + # split the training data into parts for individual jobs + # we will use the same number of jobs as that used for alignment + common_lib.execute_command("utils/split_data.sh {0} {1}".format( + args.feat_dir, num_jobs)) + shutil.copy('{0}/tree'.format(args.tree_dir), args.dir) + with open('{0}/num_jobs'.format(args.dir), 'w') as f: + f.write(str(num_jobs)) + + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + variables = common_train_lib.parse_generic_config_vars_file(var_file) + + # Set some variables. + try: + model_left_context = variables['model_left_context'] + model_right_context = variables['model_right_context'] + except KeyError as e: + raise Exception("KeyError {0}: Variables need to be defined in " + "{1}".format(str(e), '{0}/configs'.format(args.dir))) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + left_context_initial = (args.chunk_left_context_initial + model_left_context if + args.chunk_left_context_initial >= 0 else -1) + right_context_final = (args.chunk_right_context_final + model_right_context if + args.chunk_right_context_final >= 0 else -1) + + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. 
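For the context bookkeeping above: the chunk context requested for the egs is added to the model's own left/right context, and the egs-generation stage below further pads by half the frame-subsampling factor so the data can be frame-shifted during training. A worked example with made-up values (integer division, as in the script):

# Worked example of the context arithmetic used when dumping chain egs.
# The values are illustrative, not read from any real config.
model_left_context, model_right_context = 16, 12
chunk_left_context, chunk_right_context = 0, 0
frame_subsampling_factor = 3

left_context = chunk_left_context + model_left_context
right_context = chunk_right_context + model_right_context
egs_left_context = left_context + frame_subsampling_factor // 2     # +1 for frame shifts
egs_right_context = right_context + frame_subsampling_factor // 2
print("egs left/right context: %d / %d" % (egs_left_context, egs_right_context))  # 17 / 13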
+ if (args.stage <= -6): + logger.info("Creating phone language-model") + chain_lib.create_phone_lm(args.dir, + (args.tree_dir if args.alignments_for_lm is None + else args.alignments_for_lm), run_opts, + lm_opts=args.lm_opts) + + if (args.stage <= -5): + logger.info("Creating denominator FST") + chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts) + + if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config"): + logger.info("Initializing a basic network for estimating " + "preconditioning matrix") + common_lib.execute_command( + """{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config \ + {dir}/init.raw""".format(command=run_opts.command, + dir=args.dir)) + + egs_left_context = left_context + args.frame_subsampling_factor / 2 + egs_right_context = right_context + args.frame_subsampling_factor / 2 + # note: the '+ args.frame_subsampling_factor / 2' is to allow for the + # fact that we'll be shifting the data slightly during training to give + # variety to the training data. + egs_left_context_initial = (left_context_initial + args.frame_subsampling_factor / 2 if + left_context_initial >= 0 else -1) + egs_right_context_final = (right_context_final + args.frame_subsampling_factor / 2 if + right_context_final >= 0 else -1) + + default_egs_dir = '{0}/egs'.format(args.dir) + if (args.stage <= -3) and args.egs_dir is None: + logger.info("Generating egs") + # this is where get_egs.sh is called. + chain_lib.generate_chain_egs( + dir=args.dir, data=args.feat_dir, + lat_dir=args.lat_dir, egs_dir=default_egs_dir, + left_context=egs_left_context, + right_context=egs_right_context, + left_context_initial=egs_left_context_initial, + right_context_final=egs_right_context_final, + run_opts=run_opts, + left_tolerance=args.left_tolerance, + right_tolerance=args.right_tolerance, + frame_subsampling_factor=args.frame_subsampling_factor, + alignment_subsampling_factor=args.alignment_subsampling_factor, + frames_per_eg_str=args.chunk_width, + srand=args.srand, + egs_opts=args.egs_opts, + cmvn_opts=args.cmvn_opts, + online_ivector_dir=args.online_ivector_dir, + frames_per_iter=args.frames_per_iter, + transform_dir=args.transform_dir, + stage=args.egs_stage) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, + frames_per_eg_str, num_archives] = ( + common_train_lib.verify_egs_dir(egs_dir, feat_dim, + ivector_dim, ivector_id, + egs_left_context, egs_right_context, + egs_left_context_initial, + egs_right_context_final)) + assert(os.path.getctime('{0}/cegs.1.ark'.format(egs_dir)) > + os.path.getctime('{0}/phone_lm.fst'.format(args.dir))) + assert(args.chunk_width == frames_per_eg_str) + num_archives_expanded = num_archives * args.frame_subsampling_factor + + if (args.num_jobs_final > num_archives_expanded): + raise Exception('num_jobs_final cannot exceed the ' + 'expanded number of archives') + + # copy the properties of the egs to dir for + # use during decoding + logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir)) + common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) + + if (args.stage <= -2) and os.path.exists(args.dir+"/configs/init.config"): + logger.info('Computing the preconditioning matrix for input features') + + chain_lib.compute_preconditioning_matrix( + args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs=args.max_lda_jobs, + rand_prune=args.rand_prune) + + if (args.stage <= -1): + logger.info("Preparing the initial 
acoustic model.") + chain_lib.prepare_initial_acoustic_model(args.dir, run_opts) + + with open("{0}/frame_subsampling_factor".format(args.dir), "w") as f: + f.write(str(args.frame_subsampling_factor)) + + # set num_iters so that as close as possible, we process the data + # $num_epochs times, i.e. $num_iters*$avg_num_jobs) == + # $num_epochs*$num_archives, where + # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. + num_archives_to_process = int(args.num_epochs * num_archives_expanded) + num_archives_processed = 0 + num_iters = ((num_archives_to_process * 2) + / (args.num_jobs_initial + args.num_jobs_final)) + + models_to_combine = common_train_lib.get_model_combine_iters( + num_iters, args.num_epochs, + num_archives_expanded, args.max_models_combine, + args.num_jobs_final) + + def learning_rate(iter, current_num_jobs, num_archives_processed): + return common_train_lib.get_learning_rate(iter, current_num_jobs, + num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + + min_deriv_time = None + max_deriv_time_relative = None + if args.deriv_truncate_margin is not None: + min_deriv_time = -args.deriv_truncate_margin - model_left_context + max_deriv_time_relative = \ + args.deriv_truncate_margin + model_right_context + + logger.info("Training will run for {0} epochs = " + "{1} iterations".format(args.num_epochs, num_iters)) + + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + + (args.num_jobs_final - args.num_jobs_initial) + * float(iter) / num_iters) + + if args.stage <= iter: + model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter) + + lrate = learning_rate(iter, current_num_jobs, + num_archives_processed) + shrink_value = 1.0 + if args.proportional_shrink != 0.0: + shrink_value = 1.0 - (args.proportional_shrink * lrate) + if shrink_value <= 0.5: + raise Exception("proportional-shrink={0} is too large, it gives " + "shrink-value={1}".format(args.proportional_shrink, + shrink_value)) + + if args.shrink_value < shrink_value: + shrink_value = (args.shrink_value + if common_train_lib.should_do_shrinkage( + iter, model_file, + args.shrink_saturation_threshold) + else shrink_value) + + chain_lib.train_one_iteration( + dir=args.dir, + iter=iter, + srand=args.srand, + egs_dir=egs_dir, + num_jobs=current_num_jobs, + num_archives_processed=num_archives_processed, + num_archives=num_archives, + learning_rate=lrate, + dropout_edit_string=common_train_lib.get_dropout_edit_string( + args.dropout_schedule, + float(num_archives_processed) / num_archives_to_process, + iter), + shrinkage_value=shrink_value, + num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, + apply_deriv_weights=args.apply_deriv_weights, + min_deriv_time=min_deriv_time, + max_deriv_time_relative=max_deriv_time_relative, + l2_regularize=args.l2_regularize, + xent_regularize=args.xent_regularize, + leaky_hmm_coefficient=args.leaky_hmm_coefficient, + momentum=args.momentum, + max_param_change=args.max_param_change, + shuffle_buffer_size=args.shuffle_buffer_size, + frame_subsampling_factor=args.frame_subsampling_factor, + run_opts=run_opts) + + if args.cleanup: + # do a clean up everythin but the last 2 models, under certain + # conditions + common_train_lib.remove_model( + args.dir, iter-2, num_iters, models_to_combine, + args.preserve_model_interval) + + if args.email is not 
None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = ( + nnet3_log_parse.generate_acc_logprob_report( + args.dir, "log-probability")) + message = report + subject = ("Update : Expt {dir} : " + "Iter {iter}".format(dir=args.dir, iter=iter)) + common_lib.send_mail(message, subject, args.email) + + num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + logger.info("Doing final combination to produce final.mdl") + chain_lib.combine_models( + dir=args.dir, num_iters=num_iters, + models_to_combine=models_to_combine, + num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, + egs_dir=egs_dir, + leaky_hmm_coefficient=args.leaky_hmm_coefficient, + l2_regularize=args.l2_regularize, + xent_regularize=args.xent_regularize, + run_opts=run_opts, + sum_to_one_penalty=args.combine_sum_to_one_penalty) + + + if args.cleanup: + logger.info("Cleaning up the experiment directory " + "{0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + common_train_lib.clean_nnet_dir( + args.dir, num_iters, egs_dir, + preserve_model_interval=args.preserve_model_interval, + remove_egs=remove_egs) + + # do some reporting + [report, times, data] = nnet3_log_parse.generate_acc_logprob_report( + args.dir, "log-probability") + if args.email is not None: + common_lib.send_mail(report, "Update : Expt {0} : " + "complete".format(args.dir), args.email) + + with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: + f.write(report) + + common_lib.execute_command("steps/info/nnet3_dir_info.pl " + "{0}".format(args.dir)) + + +def main(): + [args, run_opts] = get_args() + try: + train(args, run_opts) + common_lib.wait_for_background_commands() + except BaseException as e: + # look for BaseException so we catch KeyboardInterrupt, which is + # what we get when a background thread dies. 
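A worked example of the iteration and job schedule used in the training loop above; all numbers are invented, the real ones come from the egs directory and the command-line options:

# num_iters is chosen so that num_iters * average-num-jobs covers the data
# num_epochs times; the number of parallel jobs ramps up linearly.
num_epochs = 2.0
num_archives_expanded = 60            # num_archives * frame_subsampling_factor
num_jobs_initial, num_jobs_final = 2, 4

num_archives_to_process = int(num_epochs * num_archives_expanded)                  # 120
num_iters = (num_archives_to_process * 2) // (num_jobs_initial + num_jobs_final)   # 40

for it in [0, num_iters // 2, num_iters - 1]:
    current_num_jobs = int(0.5 + num_jobs_initial
                           + (num_jobs_final - num_jobs_initial) * float(it) / num_iters)
    print("iter %d -> %d jobs" % (it, current_num_jobs))
# iter 0 -> 2 jobs, iter 20 -> 3 jobs, iter 39 -> 4 jobs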
+ if args.email is not None: + message = ("Training session for experiment {dir} " + "died due to an error.".format(dir=args.dir)) + common_lib.send_mail(message, message, args.email) + if not isinstance(e, KeyboardInterrupt): + traceback.print_exc() + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/src/bin/Makefile b/src/bin/Makefile index d9f8d3d27ae..bb872a9222c 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -32,7 +32,7 @@ OBJFILES = ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../lm/kaldi-lm.a \ ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ + ../tree/kaldi-tree.a ../util/kaldi-util.a \ ../matrix/kaldi-matrix.a ../base/kaldi-base.a diff --git a/src/chain/language-model.cc b/src/chain/language-model.cc index 41e06116ea8..892ab30958c 100644 --- a/src/chain/language-model.cc +++ b/src/chain/language-model.cc @@ -26,7 +26,8 @@ namespace kaldi { namespace chain { -void LanguageModelEstimator::AddCounts(const std::vector &sentence) { +void LanguageModelEstimator::AddCounts(const std::vector &sentence, + int32 weight) { KALDI_ASSERT(opts_.ngram_order >= 2 && "--ngram-order must be >= 2"); KALDI_ASSERT(opts_.ngram_order >= opts_.no_prune_ngram_order); int32 order = opts_.ngram_order; @@ -36,23 +37,23 @@ void LanguageModelEstimator::AddCounts(const std::vector &sentence) { end = sentence.end(); for (; iter != end; ++iter) { KALDI_ASSERT(*iter != 0); - IncrementCount(history, *iter); + IncrementCount(history, *iter, weight); history.push_back(*iter); if (history.size() >= order) history.erase(history.begin()); } // Probability of end of sentence. This will end up getting ignored later, but // it still makes a difference for probability-normalization reasons. - IncrementCount(history, 0); + IncrementCount(history, 0, weight); } void LanguageModelEstimator::IncrementCount(const std::vector &history, - int32 next_phone) { + int32 next_phone, int32 weight) { int32 lm_state_index = FindOrCreateLmStateIndexForHistory(history); if (lm_states_[lm_state_index].tot_count == 0) { num_active_lm_states_++; } - lm_states_[lm_state_index].AddCount(next_phone, 1); + lm_states_[lm_state_index].AddCount(next_phone, weight); } void LanguageModelEstimator::SetParentCounts() { diff --git a/src/chain/language-model.h b/src/chain/language-model.h index b2c3f4cd746..daeea9395b9 100644 --- a/src/chain/language-model.h +++ b/src/chain/language-model.h @@ -89,9 +89,9 @@ class LanguageModelEstimator { } // Adds counts for this sentence. Basically does: for each n-gram in the - // sentence, count[n-gram] += 1. The only constraint on 'sentence' is that it + // sentence, count[n-gram] += weight. The only constraint on 'sentence' is that it // should contain no zeros. - void AddCounts(const std::vector &sentence); + void AddCounts(const std::vector &sentence, int32 weight); // Estimates the LM and outputs it as an FST. Note: there is // no concept here of backoff arcs. @@ -188,7 +188,7 @@ class LanguageModelEstimator { // adds the counts for this ngram (called from AddCounts()). inline void IncrementCount(const std::vector &history, - int32 next_phone); + int32 next_phone, int32 weight); // Computes whether backoff should be allowed for this lm_state. 
(the caller diff --git a/src/chainbin/chain-est-phone-lm.cc b/src/chainbin/chain-est-phone-lm.cc index f16b3f4f14b..3b36d81ce8a 100644 --- a/src/chainbin/chain-est-phone-lm.cc +++ b/src/chainbin/chain-est-phone-lm.cc @@ -32,7 +32,7 @@ int main(int argc, char *argv[]) { "Initialize un-smoothed phone language model for 'chain' training\n" "Output in FST format (epsilon-free deterministic acceptor)\n" "\n" - "Usage: chain-est-phone-lm [options] \n" + "Usage: chain-est-phone-lm [options] ... \n" "The phone-sequences are used to train a language model.\n" "e.g.:\n" "gunzip -c input_dir/ali.*.gz | ali-to-phones input_dir/final.mdl ark:- ark:- | \\\n" @@ -40,29 +40,43 @@ int main(int argc, char *argv[]) { bool binary_write = true; LanguageModelOptions lm_opts; + std::string scales_str; ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("scales", &scales_str, "comma-separated list of integer valued scale weights used to scale different phone sequences."); lm_opts.Register(&po); po.Read(argc, argv); - if (po.NumArgs() != 2) { + if (po.NumArgs() < 2) { po.PrintUsage(); exit(1); } - std::string phone_seqs_rspecifier = po.GetArg(1), - lm_fst_wxfilename = po.GetArg(2); + int32 num_seqs = po.NumArgs() - 1; + std::vector scales(num_seqs, 1); + if (!scales_str.empty()) { + SplitStringToIntegers(scales_str, ",", false, &scales); + if (scales.size() != num_seqs) + KALDI_ERR << "--scales should have exactly " + << num_seqs << " scales."; + } + std::string lm_fst_wxfilename = po.GetArg(po.NumArgs()); LanguageModelEstimator lm_estimator(lm_opts); - SequentialInt32VectorReader phones_reader(phone_seqs_rspecifier); - KALDI_LOG << "Reading phone sequences"; - for (; !phones_reader.Done(); phones_reader.Next()) { - const std::vector &phone_seq = phones_reader.Value(); - lm_estimator.AddCounts(phone_seq); + for (int i = 1; i <= num_seqs; i++) { + std::string phone_seqs_rspecifier = po.GetArg(i); + SequentialInt32VectorReader phones_reader(phone_seqs_rspecifier); + KALDI_LOG << "Reading phone sequences"; + for (; !phones_reader.Done(); phones_reader.Next()) { + if (scales[i-1] != 0) { + const std::vector &phone_seq = phones_reader.Value(); + lm_estimator.AddCounts(phone_seq, scales[i-1]); + } + } } KALDI_LOG << "Estimating phone LM"; fst::StdVectorFst fst; From 40c85dc2d310f6c7a947e3fafd0879f5b4d4439c Mon Sep 17 00:00:00 2001 From: Pegita Date: Fri, 14 Jul 2017 18:02:23 -0400 Subject: [PATCH 026/174] updated PR w.r.t comments. 
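The chain-est-phone-lm change above lets the binary take several phone-sequence sources plus a --scales list that must contain one integer per source; an empty list means every source counts with weight 1, and a zero scale skips that source entirely. A small sketch of the same validation, with made-up argument values:

# Sketch of the --scales handling: one integer scale per phone-sequence
# source, defaulting to 1, with 0 meaning "contribute no counts".
def parse_scales(scales_str, num_seqs):
    if not scales_str:
        return [1] * num_seqs
    scales = [int(s) for s in scales_str.split(",")]
    if len(scales) != num_seqs:
        raise ValueError("--scales should have exactly %d scales" % num_seqs)
    return scales

sources = ["ark:tg_phones.ark", "ark:src_phones.ark", "ark:extra_phones.ark"]
for src, scale in zip(sources, parse_scales("10,1,0", len(sources))):
    if scale == 0:
        continue
    print("counting %s with weight %d" % (src, scale))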
--- .../local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 37 ++++++++++--------- .../nnet3/train/chain_objf/acoustic_model.py | 14 ------- egs/wsj/s5/steps/libs/nnet3/train/common.py | 30 +++++++++++++-- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 22 +++++++---- egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py | 2 +- egs/wsj/s5/steps/nnet3/xconfig_to_configs.py | 8 ++-- 6 files changed, 64 insertions(+), 49 deletions(-) diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index 897bd21a788..5a020e9e0a2 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -105,8 +105,22 @@ if [ $stage -le 5 ]; then rm $lat_dir/fsts.*.gz # save space fi +converted_ali_dir=exp/converted_ali_wsj +if [ $stage -le 6 ]; then + echo "$0: convert target alignment using tree in src-tree-dir" + mkdir -p $converted_ali_dir + mkdir -p $converted_ali_dir/log + num_ali_job=`cat $ali_dir/num_jobs` + cp $ali_dir/num_jobs $converted_ali_dir + cp $src_tree_dir/{tree,final.mdl} $converted_ali_dir + $decode_cmd JOB=1:$num_ali_job $converted_ali_dir/log/convert_ali.JOB.log \ + convert-ali $ali_dir/final.mdl $src_tree_dir/final.mdl \ + $src_tree_dir/tree "ark:gunzip -c $ali_dir/ali.JOB.gz |" \ + "ark:| gzip -c > $converted_ali_dir/ali.JOB.gz" +fi -if [ $stage -le 8 ]; then + +if [ $stage -le 7 ]; then echo "$0: creating neural net configs using the xconfig parser for"; echo "extra layers w.r.t source network."; num_targets=$(tree-info $src_tree_dir/tree |grep num-pdfs|awk '{print $2}') @@ -129,21 +143,8 @@ EOF --config-dir $dir/configs/ fi -converted_ali_dir=exp/converted_ali_wsj -if [ $stage -le 8 ]; then - echo "$0: convert target alignment using tree in src-tree-dir" - mkdir -p $converted_ali_dir - mkdir -p $converted_ali_dir/log - num_ali_job=`cat $ali_dir/num_jobs` - cp $ali_dir/num_jobs $converted_ali_dir - cp $src_tree_dir/{tree,final.mdl} $converted_ali_dir - $decode_cmd JOB=1:$num_ali_job $converted_ali_dir/log/convert_ali.JOB.log \ - convert-ali $ali_dir/final.mdl $src_tree_dir/final.mdl \ - $src_tree_dir/tree "ark:gunzip -c $ali_dir/ali.JOB.gz |" \ - "ark:| gzip -c > $converted_ali_dir/ali.JOB.gz" -fi -if [ $stage -le 9 ]; then +if [ $stage -le 8 ]; then echo "$0: generate egs for chain to train new model on rm dataset." if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ @@ -183,12 +184,12 @@ if [ $stage -le 9 ]; then --dir $dir || exit 1; fi -if [ $stage -le 10 ]; then +if [ $stage -le 9 ]; then steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ data/test_hires $srcdir/exp/nnet3/extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; fi -if [ $stage -le 11 ]; then +if [ $stage -le 10 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
@@ -200,7 +201,7 @@ if [ $stage -le 11 ]; then $dir/graph data/test_hires $dir/decode || exit 1; fi -if [ $stage -le 12 ]; then +if [ $stage -le 11 ]; then utils/mkgraph.sh --self-loop-scale 1.0 $lang_ug_dir $dir $dir/graph_ug steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj 20 --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 7f0603d0f90..d18c695f662 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -457,20 +457,6 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1): common_train_lib.prepare_initial_network(dir, run_opts, srand=srand) - - # edits 0.raw using edits.config before adding transition model. - edits_config_file = "{0}/configs/edits.config".format(dir) - if os.path.exists(edits_config_file): - logger.info("edits 0.raw model using {0}/configs/edits.config." - "".format(dir)) - common_lib.execute_command( - """{command} {dir}/log/edit.log \ - nnet3-copy --edits-config={edits_config} {dir}/0.raw \ - {dir}/0.raw - """.format(command=run_opts.command, - dir=dir, - edits_config=edits_config_file)) - # The model-format for a 'chain' acoustic model is just the transition # model and then the raw nnet, so we can use 'cat' to create this, as # long as they have the same mode (binary or not binary). diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 8898ff67d18..17be05f0b60 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -502,13 +502,34 @@ def smooth_presoftmax_prior_scale_vector(pdf_counts, def prepare_initial_network(dir, run_opts, srand=-3): - if os.path.exists(dir+"/init.raw"): + if not os.path.exists("{0}/configs/init.config".format(dir)): + edits_config_file = "{0}/configs/edits.config".format(dir) common_lib.execute_command( """{command} {dir}/log/add_first_layer.log \ nnet3-init --srand={srand} {dir}/init.raw \ - {dir}/configs/final.config {dir}/0.raw""".format( + {dir}/configs/final.config {dir}/0{edit_suffix}.raw""".format( command=run_opts.command, srand=srand, - dir=dir)) + dir=dir, + edit_suffix = ('.pre-edited' if os.path.exists(edits_config_file) + else ''))) + # Modify 0.pre-edited.raw using edits.config before adding transition model, + # that can be used in transfer learning. + # It is assumed that there is no init.config and 0.raw does not generated + # using init.config and it is copied from another setup. + # An example of edits.config is: + # i.e. remove-output-nodes name=output + # rename-node old-name= new-name=output + if os.path.exists(edits_config_file): + logger.info("edits 0.raw model using {0}/configs/edits.config." 
+ "".format(dir)) + common_lib.execute_command( + """{command} {dir}/log/edit.log \ + nnet3-copy --edits-config={edits_config} {dir}/0.pre-edited.raw \ + {dir}/0.raw + """.format(command=run_opts.command, + dir=dir, + edits_config=edits_config_file)) + else: common_lib.execute_command( """{command} {dir}/log/init_model.log \ @@ -517,6 +538,7 @@ def prepare_initial_network(dir, run_opts, srand=-3): dir=dir)) + def get_model_combine_iters(num_iters, num_epochs, num_archives, max_models_combine, num_jobs_final): @@ -856,7 +878,7 @@ def __init__(self, sequentially.""") self.parser.add_argument("--trainer.optimization.backstitch-training-scale", type=float, dest='backstitch_training_scale', - default=0.0, help="""scale of parameters changes + default=0.0, help="""scale of parameters changes used in backstitch training step.""") self.parser.add_argument("--trainer.optimization.backstitch-training-interval", type=int, dest='backstitch_training_interval', diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 8fdb9757379..0e8b850f5e1 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -82,7 +82,10 @@ def get_model_component_info(model_filename): out = common_lib.get_command_stdout("""nnet3-info {0} | grep '\-node' """ """ """.format(model_filename)) - # out contains all component-nodes used in model_filename + # out contains all {input,component}-nodes used in model_filename + # It can parse lines in out like: + # i.e. input-node name=input dim=40 + # component-node name=tdnn1.affine component=tdnn1.affine input=lda input-dim=300 output-dim=512 layer_names = [] for line in out.split("\n"): parts = line.split(" ") @@ -93,14 +96,15 @@ def get_model_component_info(model_filename): if len(key_value) == 2: key = key_value[0] value = key_value[1] - if key == "name": + if key == "name": # name=** layer_name = value - #layer_name, auxiliary_output = xutils.split_layer_name(value) - elif key == "input-dim": + elif key == "dim": # for input-node input_dim = int(value) - elif key == "output-dim": + elif key == "input-dim": # for component-node + input_dim = int(value) + elif key == "output-dim": # for component-node output_dim = int(value) - elif key == "input": + elif key == "input": # for component-node i.e. input=lda input_str = value if layer_name is not None and layer_name not in layer_names: @@ -109,9 +113,11 @@ def get_model_component_info(model_filename): key_to_value['name'] = layer_name if input_dim != -1: if output_dim == -1: - # The layer is input layer type. + # The layer type is input-node. key_to_value['dim'] = input_dim - elif input_str is not None: + else: + # The layer type is component-node + assert(input_str is not None) key_to_value['dim'] = output_dim all_layers.append(xlayers.XconfigInputLayer('input', key_to_value, all_layers)) if len(all_layers) == 0: diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py index e80a51a85ef..759640e37ed 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py @@ -60,7 +60,7 @@ def get_dim_from_layer_name(all_layers, current_layer, full_layer_name): for layer in all_layers: if layer is current_layer: break - # if "." used in layer name + # if "." 
used in layer name like tdnn.1 if layer.get_name() == full_layer_name: return layer.output_dim() diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index e2bcb9390f6..79e9ea5234d 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -31,7 +31,7 @@ def get_args(): parser.add_argument('--existing-model', help='Filename of previously trained neural net ' '(e.g. final.mdl) which is useful in case of ' - 'using list of component-node in already trained model ' + 'using list of component-nodes in already trained model ' 'to generate new config file for new model.' 'e.g. In Transfer learning: generate new model using ' 'nodes in existing model.') @@ -235,9 +235,9 @@ def add_nnet_context_info(config_dir, existing_model=None, model = """ - | nnet3-copy --edits-config={0} - {1}""".format(edits_config, model) common_lib.execute_command("""nnet3-init {0} {1}/ref.config """ - """ {2} """.format(existing_model if - existing_model is not None else "", - config_dir, model)) + """ {2} """.format(existing_model if + existing_model is not None else "", + config_dir, model)) out = common_lib.get_command_stdout("""nnet3-info {0}/ref.raw | """ """head -4""".format(config_dir)) # out looks like this From 39a731f33a21d5b106fcc6641e63b673c9dc09a1 Mon Sep 17 00:00:00 2001 From: Pegita Date: Fri, 14 Jul 2017 18:13:33 -0400 Subject: [PATCH 027/174] small fix to parser.py. --- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 0e8b850f5e1..5449477da0e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -67,15 +67,15 @@ def xconfig_line_to_object(config_line, prev_layers = None): "*** {0}".format(config_line)) raise -# This reads raw existing model (*.raw) and returns array of +# This function reads existing model (*.raw or *.mdl) and returns array of # XconfigInputLayer one per input-node or component-node with same 'name' used -# in raw model and 'dim' equal to 'output-dim' for component-node and input-dim for +# in raw model and 'dim' equal to 'output-dim' for component-node and 'dim' for # input-node. def get_model_component_info(model_filename): all_layers = [] try: f = open(model_filename, 'r') - except Exeption as e: + except Exception as e: sys.exit("{0}: error reading model file '{1}'".format(sys.argv[0], model_filename, repr(e))) # use nnet3-info to get component names in the model. From 72480ecb7eed46e37945fd956eaa6dd8231d4be8 Mon Sep 17 00:00:00 2001 From: Pegita Date: Mon, 17 Jul 2017 17:05:54 -0400 Subject: [PATCH 028/174] fixed issues w.r.t. comments (except prepare_wsj_rm_lang.sh). 
--- .../s5/local/chain/tuning/run_tdnn_wsj_rm.sh | 192 ------------------ .../local/chain/tuning/run_tdnn_wsj_rm_1a.sh | 12 +- .../local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 40 +--- .../local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 11 +- egs/rm/s5/local/prepare_wsj_rm_lang.sh | 59 +++--- .../nnet3/train/chain_objf/acoustic_model.py | 53 +++-- egs/wsj/s5/steps/libs/nnet3/train/common.py | 26 ++- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 19 +- egs/wsj/s5/steps/nnet3/chain/train_more.py | 14 +- egs/wsj/s5/steps/nnet3/train_dnn.py | 3 +- egs/wsj/s5/steps/nnet3/xconfig_to_configs.py | 2 +- src/chainbin/chain-est-phone-lm.cc | 2 +- src/nnet3/nnet-utils.cc | 2 +- 13 files changed, 117 insertions(+), 318 deletions(-) delete mode 100755 egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm.sh diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm.sh deleted file mode 100755 index 1e78f2b90ec..00000000000 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/bin/bash - -# This script uses weight transfer as Transfer learning method -# and use already trained model on wsj and remove the last layer and -# add new randomly initialized layer and retrain the whole network. -# while training new added layer using rm data. -# The chain config is as run_tdnn_5n.sh and the result is: -#System tdnn_5n tdnn_wsj_rm -#WER 2.71 2.21 -set -e - -# configs for 'chain' -stage=0 -train_stage=-10 -get_egs_stage=-10 -dir=exp/chain/tdnn_wsj_rm - -# training options -num_epochs=12 -initial_effective_lrate=0.005 -final_effective_lrate=0.0005 -leftmost_questions_truncate=-1 -max_param_change=2.0 -final_layer_normalize_target=0.5 -num_jobs_initial=2 -num_jobs_final=4 -minibatch_size=128 -frames_per_eg=150 -remove_egs=false -xent_regularize=0.1 - -# configs for transfer learning -srcdir=../../wsj/s5/ -common_egs_dir= -src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl -primary_lr_factor=0.25 -dim=450 -nnet_affix=_online -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! cuda-compiled; then - cat <$lang/topo -fi - -if [ $stage -le 6 ]; then - # Build a tree using our new topology. - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --leftmost-questions-truncate $leftmost_questions_truncate \ - --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir -fi - -if [ $stage -le 7 ]; then - echo "$0: creating neural net configs using the xconfig parser for"; - echo "extra layers w.r.t source network."; - num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - mkdir -p $dir - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - relu-renorm-layer name=tdnn7-target input=Append(tdnn6.renorm@-3,tdnn6.renorm@0) dim=$dim - ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain-target input=tdnn7-target dim=$dim target-rms=0.5 - output-layer name=output-target include-log-softmax=false dim=$num_targets max-change=1.5 - relu-renorm-layer name=prefinal-xent-target input=tdnn7-target dim=$dim target-rms=0.5 - output-layer name=output-xent-target dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 -EOF - # edits.config contains edits required to train transferred model. - # e.g. substitute output-node of previous model with new output - # and removing orphan nodes and components. 
- cat < $dir/configs/edits.config - remove-output-nodes name=output - remove-output-nodes name=output-xent - rename-node old-name=output-target new-name=output - rename-node old-name=output-xent-target new-name=output-xent - remove-orphans -EOF - steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \ - --xconfig-file $dir/configs/network.xconfig \ - --edits-config $dir/configs/edits.config \ - --config-dir $dir/configs/ -fi - -if [ $stage -le 8 ]; then - echo "$0: generate egs for chain to train new model on rm dataset." - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage - fi - echo "$0: set the learning-rate-factor for initial network to be zero." - nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" \ - $src_mdl $dir/init.raw || exit 1; - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ - --chain.xent-regularize $xent_regularize \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=200" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width $frames_per_eg \ - --trainer.num-chunk-per-minibatch=$minibatch_size \ - --trainer.frames-per-iter 1000000 \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.num-jobs-initial=$num_jobs_initial \ - --trainer.optimization.num-jobs-final=$num_jobs_final \ - --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ - --trainer.optimization.final-effective-lrate=$final_effective_lrate \ - --trainer.max-param-change $max_param_change \ - --cleanup.remove-egs false \ - --feat-dir data/train_hires \ - --tree-dir $treedir \ - --lat-dir exp/tri3b_lats \ - --dir $dir || exit 1; -fi - -if [ $stage -le 9 ]; then - steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ - data/test $srcdir/exp/nnet3/extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; -fi - -if [ $stage -le 10 ]; then - # Note: it might appear that this $lang directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --scoring-opts "--min-lmwt 1" \ - --nj 20 --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ - $dir/graph data/test_hires $dir/decode || exit 1; -fi - -if [ $stage -le 11 ]; then - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 20 --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ - $dir/graph_ug data/test_hires $dir/decode_ug || exit 1; -fi -wait; -exit 0; diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh index e232bd39e46..6c224e35458 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -1,12 +1,12 @@ #!/bin/bash # This script uses weight transfer as a Transfer learning method -# and use already trained model on wsj and remove the last layer and -# add new randomly initialized layer and retrain the whole network. +# and use already trained model on wsj and removes the last layer and +# add new randomly initialized layer and retrain the whole network, # while training new added layer using rm data. -# The chain config is as run_tdnn_5n.sh and the result is: -#System tdnn_5n tdnn_wsj_rm -#WER 2.71 2.21 +# The chain config is as in run_tdnn_5n.sh and the result is: +#System tdnn_5n tdnn_wsj_rm_1a +#WER 2.71 2.09 set -e # configs for 'chain' @@ -133,7 +133,7 @@ if [ $stage -le 8 ]; then /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi echo "$0: set the learning-rate-factor for initial network to be zero." - nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" \ + nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" \ $src_mdl $dir/init.raw || exit 1; steps/nnet3/chain/train.py --stage $train_stage \ diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index 5a020e9e0a2..e1e3cc35ada 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -44,7 +44,7 @@ src_tree_dir=$srcdir/exp/chain/tree_a_sp # chain tree-dir for src data; primary_lr_factor=0.25 # learning-rate factor for all except last layer in transferring source model final_lr_factor=1.0 # learning-rate factor for final layer in transferring source model. nnet_affix=_online_wsj -tg_lm_scale=10 +tgt_lm_scale=10 src_lm_scale=1 tdnn_affix=_1b # End configuration section. @@ -76,7 +76,7 @@ dir=exp/chain/tdnn_wsj_rm${tdnn_affix} if [ $stage -le -1 ]; then echo "$0: prepare lexicon.txt for RM using WSJ lexicon." if ! 
cmp -s <(grep -v "^#" $src_lang/phones.txt) <(grep -v "^#" data/lang/phones.txt); then - local/prepare_wsj_rm_lang.sh $srcdir/data/local/dict_nosp $srcdir/data/lang/phones.txt $lang_dir + local/prepare_wsj_rm_lang.sh $srcdir/data/local/dict_nosp $srcdir/data/lang $lang_dir else rm -rf $lang_dir cp -r data/lang $lang_dir @@ -105,38 +105,14 @@ if [ $stage -le 5 ]; then rm $lat_dir/fsts.*.gz # save space fi -converted_ali_dir=exp/converted_ali_wsj if [ $stage -le 6 ]; then - echo "$0: convert target alignment using tree in src-tree-dir" - mkdir -p $converted_ali_dir - mkdir -p $converted_ali_dir/log - num_ali_job=`cat $ali_dir/num_jobs` - cp $ali_dir/num_jobs $converted_ali_dir - cp $src_tree_dir/{tree,final.mdl} $converted_ali_dir - $decode_cmd JOB=1:$num_ali_job $converted_ali_dir/log/convert_ali.JOB.log \ - convert-ali $ali_dir/final.mdl $src_tree_dir/final.mdl \ - $src_tree_dir/tree "ark:gunzip -c $ali_dir/ali.JOB.gz |" \ - "ark:| gzip -c > $converted_ali_dir/ali.JOB.gz" -fi - - -if [ $stage -le 7 ]; then echo "$0: creating neural net configs using the xconfig parser for"; echo "extra layers w.r.t source network."; num_targets=$(tree-info $src_tree_dir/tree |grep num-pdfs|awk '{print $2}') learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) mkdir -p $dir mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - output-layer name=output-tmp input=tdnn6.renorm dim=$num_targets -EOF - # edits.config contains edits required to train transferred model. - # e.g. substitute output-node of previous model with new output - # and removing orphan nodes and components. - cat < $dir/configs/edits.config - remove-output-nodes name=output-tmp - remove-orphans -EOF + touch $dir/configs/network.xconfig steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \ --xconfig-file $dir/configs/network.xconfig \ --edits-config $dir/configs/edits.config \ @@ -144,7 +120,7 @@ EOF fi -if [ $stage -le 8 ]; then +if [ $stage -le 7 ]; then echo "$0: generate egs for chain to train new model on rm dataset." if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ @@ -159,7 +135,7 @@ if [ $stage -le 8 ]; then --cmd "$decode_cmd" \ --feat.online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ --chain.xent-regularize $xent_regularize \ - --chain.alignments-for-lm="$converted_ali_dir:$tg_lm_scale,$src_tree_dir:$src_lm_scale" \ + --chain.alignments-for-lm="$ali_dir:$tgt_lm_scale,$src_tree_dir:$src_lm_scale" \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize 0.1 \ --chain.leaky-hmm-coefficient 0.1 \ @@ -184,12 +160,12 @@ if [ $stage -le 8 ]; then --dir $dir || exit 1; fi -if [ $stage -le 9 ]; then +if [ $stage -le 8 ]; then steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ data/test_hires $srcdir/exp/nnet3/extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; fi -if [ $stage -le 10 ]; then +if [ $stage -le 9 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
@@ -201,7 +177,7 @@ if [ $stage -le 10 ]; then $dir/graph data/test_hires $dir/decode || exit 1; fi -if [ $stage -le 11 ]; then +if [ $stage -le 10 ]; then utils/mkgraph.sh --self-loop-scale 1.0 $lang_ug_dir $dir $dir/graph_ug steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj 20 --cmd "$decode_cmd" \ diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index 5f4aa43d01a..a6e1616c25e 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -74,7 +74,7 @@ lat_dir=exp/chain_lats${src_tree_dir:+_wsj} if [ $stage -le -1 ]; then echo "$0: prepare lang for RM-WSJ using WSJ phone set and lexicon and RM word list." if ! cmp -s <(grep -v "^#" $src_lang/phones.txt) <(grep -v "^#" data/lang/phones.txt); then - local/prepare_wsj_rm_lang.sh $srcdir/data/local/dict_nosp $src_lang/phones.txt $lang_dir + local/prepare_wsj_rm_lang.sh $srcdir/data/local/dict_nosp $src_lang $lang_dir else rm -rf $lang_dir cp -r data/lang $lang_dir @@ -119,14 +119,7 @@ if [ $stage -le 6 ]; then num_targets=$(tree-info $src_tree_dir/tree |grep num-pdfs|awk '{print $2}') mkdir -p $dir mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - output-layer name=output-tmp input=tdnn6.renorm dim=$num_targets -EOF - - cat < $dir/configs/edits.config - remove-output-nodes name=output-tmp - remove-orphans -EOF + touch $dir/configs/network.xconfig steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \ --xconfig-file $dir/configs/network.xconfig \ --edits-config $dir/configs/edits.config \ diff --git a/egs/rm/s5/local/prepare_wsj_rm_lang.sh b/egs/rm/s5/local/prepare_wsj_rm_lang.sh index 56dd351a8d2..10a01514ca0 100755 --- a/egs/rm/s5/local/prepare_wsj_rm_lang.sh +++ b/egs/rm/s5/local/prepare_wsj_rm_lang.sh @@ -1,65 +1,68 @@ #!/bin/bash # Copyright 2017 Pegah Ghahremani -# This script prepares a dictionary for sourc-target using source phone set, lexicon and dict and -# target words.txt are copied from source lexicon for common words in source -# and target. words in target that are not available in the source lexicon are added +# This script prepares a dictionary for wsj-rm experiment using wsj phone set, lexicon and dict and +# rm's words.txt are copied from wsj lexicon for common words in wsj +# and rm. words in rm that are not available in the wsj lexicon are added # as oov in lexicon.txt. -# The is also added to words.txt and G.fst is recompiled using +# The oov word "" in wsj is also added to words.txt and G.fst is recompiled using # updated word list. +if [ -f path.sh ]; then . ./path.sh; fi . utils/parse_options.sh if [ $# != 3 ]; then - echo "Usage: local/prepare_wsj_rm_lang.sh " + echo "Usage: local/prepare_wsj_rm_lang.sh " echo "e.g:" - echo "$0 ../../wsj/s5/data/local/dict ../../wsj/s5/data/lang/phones.txt data/wsj_rm_dir" + echo "$0 ../../wsj/s5/data/local/dict ../../wsj/s5/data/lang_nosp data/wsj_rm_dir" fi src_dict=$1 -src_phones=$2 -src_tg_lang=$3 +src_lang=$2 +src_tgt_lang=$3 -required_dict_files="lexicon.txt nonsilence_phones.txt silence_phones.txt optional_silence.txt" +required_dict_files="$src_dict/lexicon.txt $src_dict/nonsilence_phones.txt $src_dict/silence_phones.txt $src_dict/optional_silence.txt $src_lang/oov.txt $src_lang/phones.txt" for f in $required_dict_files; do - if [ ! -f $src_dict/$f ]; then - echo "file $src_dict/$f that is required for preparing lang does not exists." && exit 1; + if [ ! 
-f $f ]; then + echo "file $f that is required for preparing lang does not exists." && exit 1; fi done -rm -rf $src_tg_lang -mkdir -p $src_tg_lang -mkdir -p $src_tg_lang/local +rm -rf $src_tgt_lang +mkdir -p $src_tgt_lang +mkdir -p $src_tgt_lang/local # copy *phones.txt from source to target. -cp -r $src_dict $src_tg_lang/local/dict -rm $src_tg_lang/local/dict/lexicon.txt +cp -r $src_dict $src_tgt_lang/local/dict +rm $src_tgt_lang/local/dict/lexicon*.txt +oov_word=`cat $src_lang/oov.txt` # common word list in rm lexicon with lexicon in wsj comm -12 <(awk '{print $1}' data/local/dict/lexicon.txt | sed "s/\+/\'/g" | sort) \ <(awk '{print $1}' $src_dict/lexicon.txt | sort) | \ -sed -r "s/'/+/g" | sort > $src_tg_lang/words_tmp.txt +sed -r "s/'/+/g" | sort > $src_tgt_lang/words_tmp.txt comm -23 <(awk '{print $1}' data/local/dict/lexicon.txt | sed "s/\+/\'/g" | sort) \ <(awk '{print $1}' $src_dict/lexicon.txt | sort) | \ -sed -r "s/'/+/g" | sort > $src_tg_lang/words_only_tg.txt +sed -r "s/'/+/g" | sort > $src_tgt_lang/words_only_tgt.txt # add to rm_swj_word list -(echo ""; cat $src_tg_lang/words_tmp.txt) | sort > $src_tg_lang/words_tg_src.txt -rm $src_tg_lang/words_tmp.txt +(echo "$oov_word"; cat $src_tgt_lang/words_tmp.txt) | sort > $src_tgt_lang/words_tgt_src.txt +rm $src_tgt_lang/words_tmp.txt -# we use wsj lexicon and find common word list in rm and wsj to generate lexicon for rm +# we use wsj lexicon and find common word list in rm and wsj to generate lexicon for rm-wsj # using wsj phone sets. More than 90% of words in RM are in WSJ(950/994). -cat $src_tg_lang/words_tg_src.txt | sed "s/\+/\'/g" | \ +cat $src_tgt_lang/words_tgt_src.txt | sed "s/\+/\'/g" | \ utils/apply_map.pl --permissive $src_dict/lexicon.txt | \ -paste <(cat $src_tg_lang/words_tg_src.txt) - > $src_tg_lang/local/dict/lexicon_tg_src.txt +paste <(cat $src_tgt_lang/words_tgt_src.txt) - > $src_tgt_lang/local/dict/lexicon_tgt_src.txt # extend lexicon.txt by adding only_tg words as oov. -cat $src_tg_lang/local/dict/lexicon_tg_src.txt <(sed 's/$/ SPN/g' $src_tg_lang/words_only_tg.txt) | sort -u > $src_tg_lang/local/dict/lexicon.txt +oov_phone=`grep "$oov_word" $src_dict/lexicon.txt | cut -d' ' -f2` +cat $src_tgt_lang/local/dict/lexicon_tgt_src.txt <(sed 's/$/ SPN/g' $src_tgt_lang/words_only_tgt.txt) | sort -u > $src_tgt_lang/local/dict/lexicon.txt # prepare dictionary using new lexicon.txt for RM-SWJ. 
-utils/prepare_lang.sh --phone-symbol-table $src_phones \ -$src_tg_lang/local/dict "" $src_tg_lang/local/lang_tmp $src_tg_lang +utils/prepare_lang.sh --phone-symbol-table $src_lang/phones.txt \ +$src_tgt_lang/local/dict "$oov_word" $src_tgt_lang/local/lang_tmp $src_tgt_lang # Generate new G.fst using updated words list with added -fstcompile --isymbols=$src_tg_lang/words.txt --osymbols=$src_tg_lang/words.txt --keep_isymbols=false \ - -keep_osymbols=false data/local/tmp/G.txt | fstarcsort --sort_type=ilabel > $src_tg_lang/G.fst || exit 1; +fstcompile --isymbols=$src_tgt_lang/words.txt --osymbols=$src_tgt_lang/words.txt --keep_isymbols=false \ + -keep_osymbols=false data/local/tmp/G.txt | fstarcsort --sort_type=ilabel > $src_tgt_lang/G.fst || exit 1; diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index d18c695f662..69969ed0535 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -21,29 +21,35 @@ logger.addHandler(logging.NullHandler()) -def create_phone_lm(dir, tree_dir, run_opts, lm_opts=None): +def create_phone_lm(dir, tree_dir, run_opts, alignment_dirs=None, lm_opts=None): """Create a phone LM for chain training. - This method trains a weighted phone LM for chain training generated by - weighted combination of counts in alignments - in "tree_dir" - "tree_dir" is a list of comma-separated alignments directories with - same tree and phone sets (each containing tree and ali.*.gz) - and the colon-separated integer weight appended + This method trains a phone LM usingalignments. + If alignment_dirs is non empty, it trains weighted phone LM for chain + training generated by weighted combination of counts in alignments provided + in "alignments_dir", which is a list of comma-separated alignment directories + with same phone sets (each containing tree , final.mdl and ali.*.gz) + and the colon-separated integer weights appended to each alignment dir.(if none, the weight is 1) - i.e. ali_dir1:w1,ali_dir2:w2 + i.e. alignment_dirs="ali_dir1:w1,ali_dir2:w2" + If "alignment_dirs is empty, phone LM generated using alignments in tree_dir. 
""" - tree_dirs = tree_dir.split(",") ali_dirs = [] scales = [] - for tree_dir in tree_dirs: - ali_and_weights = tree_dir.split(":") - ali_dirs.append(ali_and_weights[0]) - if (len(ali_and_weights) == 2): - scales.append(str(int(ali_and_weights[1]))) - else: - scales.append(1) - phone_lists = [] + if alignment_dirs is not None: + ali_weight_dirs = alignment_dirs.split(",") + for ali_weight_dir in ali_weight_dirs: + ali_and_weights = ali_weight_dir.split(":") + ali_dirs.append(ali_and_weights[0]) + if (len(ali_and_weights) == 2): + assert(ali_and_weights[1].isdigit()) + scales.append(ali_and_weights[1]) + else: + scales.append("1") + else: + ali_dirs.append(tree_dir) + + phone_alignments_list = [] for ali_dir in ali_dirs: try: f = open(ali_dir + "/num_jobs", 'r') @@ -53,23 +59,26 @@ def create_phone_lm(dir, tree_dir, run_opts, lm_opts=None): raise Exception("""There was an error getting the number of alignment jobs from {0}/num_jobs""".format(ali_dir)) - assert(filecmp.cmp('{0}/tree'.format(ali_dirs[0]), '{0}/tree'.format(ali_dir))) + common_lib.execute_command( + """ utils/lang/check_phones_compatible.sh \ + {0}/phones.txt {1}/phones.txt""".format(tree_dir, ali_dir)) alignments=' '.join(['{0}/ali.{1}.gz'.format(ali_dir, job) for job in range(1, num_ali_jobs + 1)]) - phone_lists.append("""'ark:gunzip -c {0} | ali-to-phones {1}/final.mdl ark:- ark:-|'""".format(alignments, ali_dir)) - phone_lists_str=' '.join(phone_lists) + phone_alignments_list.append("""'ark:gunzip -c {0} | ali-to-phones {1}/final.mdl ark:- ark:-|'""".format(alignments, ali_dir)) + + phone_alignments_list_str=' '.join(phone_alignments_list) if len(scales) > 0: scales_str=','.join(scales) lm_opts="{0} --scales={1}".format(lm_opts, scales_str) common_lib.execute_command( """{command} {dir}/log/make_phone_lm.log \ - chain-est-phone-lm {lm_opts} {phone_lists} {dir}/phone_lm.fst""".format( + chain-est-phone-lm {lm_opts} {phone_alignments_list} {dir}/phone_lm.fst""".format( command=run_opts.command, dir=dir, lm_opts=lm_opts if lm_opts is not None else '', - phone_lists=phone_lists_str)) + phone_alignments_list=phone_alignments_list_str)) def create_denominator_fst(dir, tree_dir, run_opts): diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 17be05f0b60..a96f4efaa49 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -502,7 +502,21 @@ def smooth_presoftmax_prior_scale_vector(pdf_counts, def prepare_initial_network(dir, run_opts, srand=-3): - if not os.path.exists("{0}/configs/init.config".format(dir)): + """ This function prepares "0.raw" model by adding layers + in final.config. + If dir/init.raw exists, the layers in "final.config" added to "init.raw", + otherwise the new model is initialized using final.config. + If {dir}/configs/edit.config exists, the intermediate model "0.pre-edited.raw" + is generated by modifying "init.raw" by adding layers in "final.config" and + this model is eddited using edits.config as "0.raw". + edits.config applied in cases where the initial network "init.raw" + is copied from another setup (i.e. Weight transfer and we need to rename + output nodes) and there is no init.config in config dir to generate "init.raw". + An example of edits.config is: + i.e. 
remove-output-nodes name=output + rename-node old-name= new-name=output + """ + if os.path.exists("{0}/init.raw".format(dir)): edits_config_file = "{0}/configs/edits.config".format(dir) common_lib.execute_command( """{command} {dir}/log/add_first_layer.log \ @@ -512,13 +526,8 @@ def prepare_initial_network(dir, run_opts, srand=-3): dir=dir, edit_suffix = ('.pre-edited' if os.path.exists(edits_config_file) else ''))) - # Modify 0.pre-edited.raw using edits.config before adding transition model, - # that can be used in transfer learning. - # It is assumed that there is no init.config and 0.raw does not generated - # using init.config and it is copied from another setup. - # An example of edits.config is: - # i.e. remove-output-nodes name=output - # rename-node old-name= new-name=output + assert(os.path.exists("{0}/configs/edits.config".format(dir)) or + os.path.exists(edits_config_file)) if os.path.exists(edits_config_file): logger.info("edits 0.raw model using {0}/configs/edits.config." "".format(dir)) @@ -538,7 +547,6 @@ def prepare_initial_network(dir, run_opts, srand=-3): dir=dir)) - def get_model_combine_iters(num_iters, num_epochs, num_archives, max_models_combine, num_jobs_final): diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 5449477da0e..f890dd878a9 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -111,21 +111,22 @@ def get_model_component_info(model_filename): key_to_value = dict() layer_names.append(layer_name) key_to_value['name'] = layer_name - if input_dim != -1: - if output_dim == -1: - # The layer type is input-node. - key_to_value['dim'] = input_dim - else: - # The layer type is component-node - assert(input_str is not None) - key_to_value['dim'] = output_dim - all_layers.append(xlayers.XconfigInputLayer('input', key_to_value, all_layers)) + assert(input_dim != -1) + if output_dim == -1: + # The layer type is input-node. + key_to_value['dim'] = input_dim + else: + # The layer type is component-node + assert(input_str is not None) + key_to_value['dim'] = output_dim + all_layers.append(xlayers.XconfigInputLayer('input', key_to_value, all_layers)) if len(all_layers) == 0: raise RuntimeError("{0}: model filename '{1}' is empty.".format( sys.argv[0], model_filename)) f.close() return all_layers + # This function reads an xconfig file and returns it as a list of layers # (usually we use the variable name 'all_layers' elsewhere for this). # It will die if the xconfig file is empty or if there was diff --git a/egs/wsj/s5/steps/nnet3/chain/train_more.py b/egs/wsj/s5/steps/nnet3/chain/train_more.py index 3577719aed3..047a20732b6 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_more.py +++ b/egs/wsj/s5/steps/nnet3/chain/train_more.py @@ -4,7 +4,8 @@ # 2016 Vimal Manohar # Apache 2.0. -""" This script is based on steps/nnet3/chain/train.sh +""" This script is as steps/nnet3/chain/train.py but it is used for weight transfer + and generates weighted phone LM for chain model using multiple alignment sources. """ import argparse @@ -71,7 +72,7 @@ def get_args(): (separated with colon), used to generate weighted phone language model for denominator FST. The phone sets should be similar - for all alignment dirs and tree-dir. + for all alignment dirs. If empty, alignments in tree-dir used for phone LM generation. e.g. "src1/ali_dir:10.0,src2/ali_dir:2.0" @@ -287,7 +288,7 @@ def train(args, run_opts): args.lat_dir) # Set some variables. 
- num_jobs = 4 #common_lib.get_number_of_jobs(args.lat_dir) + num_jobs = common_lib.get_number_of_jobs(args.lat_dir) feat_dim = common_lib.get_feat_dim(args.feat_dir) ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir) @@ -326,9 +327,8 @@ def train(args, run_opts): # transform. if (args.stage <= -6): logger.info("Creating phone language-model") - chain_lib.create_phone_lm(args.dir, - (args.tree_dir if args.alignments_for_lm is None - else args.alignments_for_lm), run_opts, + chain_lib.create_phone_lm(args.dir, args.tree_dir, run_opts, + alignment_dirs=args.alignments_for_lm, lm_opts=args.lm_opts) if (args.stage <= -5): @@ -391,6 +391,8 @@ def train(args, run_opts): egs_left_context, egs_right_context, egs_left_context_initial, egs_right_context_final)) + + # check the cegs.*.ark is newer than phone LM "phone_lm.fst". assert(os.path.getctime('{0}/cegs.1.ark'.format(egs_dir)) > os.path.getctime('{0}/phone_lm.fst'.format(args.dir))) assert(args.chunk_width == frames_per_eg_str) diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index f9815e7fcd5..3b2c09a7fa9 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -208,8 +208,7 @@ def train(args, run_opts): nnet3-init --srand=-2 {dir}/configs/init.config \ {dir}/init.raw""".format(command=run_opts.command, dir=args.dir)) - else: - assert(os.path.exists(args.dir+"/init.raw")) + assert(os.path.exists(args.dir+"/init.raw")) default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -4) and args.egs_dir is None: diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index 79e9ea5234d..e0592aa41c6 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -228,7 +228,7 @@ def write_config_files(config_dir, all_layers): def add_nnet_context_info(config_dir, existing_model=None, - edits_config=None): + edits_config=None): """This will be removed when python script refactoring is done.""" model = "{0}/ref.raw".format(config_dir) if edits_config is not None: diff --git a/src/chainbin/chain-est-phone-lm.cc b/src/chainbin/chain-est-phone-lm.cc index 3b36d81ce8a..3e82417c315 100644 --- a/src/chainbin/chain-est-phone-lm.cc +++ b/src/chainbin/chain-est-phone-lm.cc @@ -32,7 +32,7 @@ int main(int argc, char *argv[]) { "Initialize un-smoothed phone language model for 'chain' training\n" "Output in FST format (epsilon-free deterministic acceptor)\n" "\n" - "Usage: chain-est-phone-lm [options] ... \n" + "Usage: chain-est-phone-lm [options] [... ] \n" "The phone-sequences are used to train a language model.\n" "e.g.:\n" "gunzip -c input_dir/ali.*.gz | ali-to-phones input_dir/final.mdl ark:- ark:- | \\\n" diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index bafc5ef5216..84e934c22b4 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -656,7 +656,7 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) { KALDI_LOG << "Set learning rates for " << num_learning_rates_set << " nodes."; } else if (directive == "set-learning-rate-factor") { std::string name_pattern = "*"; - // name_pattern defaults to '*' if non is given. + // name_pattern defaults to '*' if none is given. 
config_line.GetValue("name", &name_pattern); BaseFloat learning_rate_factor = -1; if (!config_line.GetValue("learning-rate-factor", &learning_rate_factor)) { From 48d81619a14a0f44f5a2cd01b807a43f17fca522 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sun, 23 Jul 2017 18:00:54 -0400 Subject: [PATCH 029/174] chain: Fixes for silence --- src/chain/chain-denominator-smbr.cc | 61 +++++++++++++++++------- src/chain/chain-denominator-smbr.h | 24 ++++++++-- src/chain/chain-kernels-ansi.h | 4 +- src/chain/chain-smbr-kernels.cu | 23 +++++---- src/chain/chain-training.cc | 35 +++++++++++--- src/chain/chain-training.h | 21 +++++++- src/chainbin/nnet3-chain-compute-prob.cc | 3 ++ src/cudamatrix/cu-matrix.h | 4 +- src/nnet3/nnet-chain-training.cc | 27 ++++++++++- src/nnet3/nnet-chain-training.h | 2 + src/nnet3/nnet-component-itf.cc | 14 ++++-- src/nnet3/nnet-simple-component.cc | 9 ++++ src/nnet3/nnet-simple-component.h | 3 ++ 13 files changed, 185 insertions(+), 45 deletions(-) diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc index 41a1110502a..87eecbb0073 100644 --- a/src/chain/chain-denominator-smbr.cc +++ b/src/chain/chain-denominator-smbr.cc @@ -53,6 +53,8 @@ DenominatorSmbrComputation::DenominatorSmbrComputation( kUndefined), tot_prob_(num_sequences_, kUndefined), tot_smbr_(num_sequences_, kUndefined), + tot_log_prob_(num_sequences_, kUndefined), + log_correction_term_(num_sequences_, kUndefined), ok_(true) { KALDI_ASSERT(opts_.leaky_hmm_coefficient >= 0.0 && opts_.leaky_hmm_coefficient < 1.0); @@ -229,7 +231,6 @@ void DenominatorSmbrComputation::AlphaSmbrDash(int32 t) { num_sequences_); alpha_smbr_sum_vec.AddRowSumMat(1.0, alpha_smbr_mat, 0.0); - BaseFloat alpha_sum = alpha_sum_vec.Sum(); KALDI_ASSERT(alpha_sum_vec.Min() > 0); alpha_smbr_mat.AddVecVec(opts_.leaky_hmm_coefficient, @@ -284,17 +285,17 @@ void DenominatorSmbrComputation::BetaSmbr(int32 t) { beta_smbr_dash_mat.DivElements(beta_dash_mat); } -BaseFloat DenominatorSmbrComputation::ForwardSmbr() { +BaseFloat DenominatorSmbrComputation::ForwardSmbr(BaseFloat *aux_objf) { AlphaSmbrFirstFrame(); AlphaSmbrDash(0); for (int32 t = 1; t <= frames_per_sequence_; t++) { AlphaSmbrGeneralFrame(t); AlphaSmbrDash(t); } - return ComputeTotObjf(); + return ComputeTotObjf(aux_objf); } -BaseFloat DenominatorSmbrComputation::ComputeTotObjf() { +BaseFloat DenominatorSmbrComputation::ComputeTotObjf(BaseFloat *aux_objf) { tot_prob_.Resize(num_sequences_); tot_smbr_.Resize(num_sequences_); // View the last alpha-dash as a matrix of size num-hmm-states by num-sequences. @@ -312,6 +313,29 @@ BaseFloat DenominatorSmbrComputation::ComputeTotObjf() { // Sum over all the HMM states for each sequence. tot_prob_.AddRowSumMat(1.0, last_alpha_dash, 0.0); + // we should probably add an ApplyLog() function that takes a vector argument. + tot_log_prob_ = tot_prob_; + tot_log_prob_.ApplyLog(); + BaseFloat tot_log_prob = tot_log_prob_.Sum(); + + // We now have to add something for the arbitrary scaling factor. [note: the + // purpose of the arbitrary scaling factors was to keep things in a good + // floating-point range] + // The inverses of all the tot-alpha quantities, for t = 0 + // ... frames_per_sequence_ - 1, were included as the 'arbitrary factors' in + // the transition-probs, so we need to multiply them all together (not + // inversed) and add them as a correction term to the total log-likes. + // These tot-alpha quantities were stored in the same place that we would + // have stored the HMM-state numbered 'num_hmm_states'. 
+ int32 num_hmm_states = den_graph_.NumStates(); + CuSubMatrix inv_arbitrary_scales( + alpha_, 0, frames_per_sequence_, + num_sequences_ * num_hmm_states, num_sequences_); + CuMatrix log_inv_arbitrary_scales( + inv_arbitrary_scales); + log_inv_arbitrary_scales.ApplyLog(); + BaseFloat log_inv_arbitrary_scales_product = + log_inv_arbitrary_scales.Sum(); BaseFloat prob_sum = tot_prob_.Sum(); KALDI_ASSERT(prob_sum == prob_sum); @@ -321,8 +345,11 @@ BaseFloat DenominatorSmbrComputation::ComputeTotObjf() { last_alpha_smbr.MulElements(last_alpha_dash); tot_smbr_.AddRowSumMat(1.0, last_alpha_smbr, 0.0); tot_smbr_.DivElements(tot_prob_); - - return tot_smbr_.Sum(); + + if (aux_objf) + *aux_objf = -opts_.mmi_factor * ( + tot_log_prob + log_inv_arbitrary_scales_product); + return opts_.smbr_factor * tot_smbr_.Sum(); } @@ -333,7 +360,7 @@ bool DenominatorSmbrComputation::BackwardSmbr( BetaSmbrDashLastFrame(); BetaSmbr(frames_per_sequence_); for (int32 t = frames_per_sequence_ - 1; t >= 0; t--) { - BetaSmbrGeneralFrame(t); + BetaSmbrDashGeneralFrame(t); if (GetVerboseLevel() >= 1 || t == 0) BetaSmbrGeneralFrameDebug(t); BetaSmbr(t); @@ -386,7 +413,7 @@ void DenominatorSmbrComputation::BetaSmbrDashLastFrame() { beta_smbr_dash_vec.SetZero(); } -void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { +void DenominatorSmbrComputation::BetaSmbrDashGeneralFrame(int32 t) { KALDI_ASSERT(t >= 0 && t < frames_per_sequence_); int32 num_pdfs = exp_nnet_output_transposed_.NumRows(); // t_wrapped gives us the time-index we use when indexing @@ -431,7 +458,8 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { this_alpha_dash, this_alpha_smbr, next_beta, next_beta_smbr, this_beta_dash, this_beta_smbr, - log_prob_deriv.Data(), log_prob_deriv.Stride()); + log_prob_deriv.Data(), log_prob_deriv.Stride(), + opts_.mmi_factor, opts_.smbr_factor); CU_SAFE_CALL(cudaGetLastError()); if (dimGrid.y == num_hmm_states) { break; // this is the normal case. @@ -482,7 +510,9 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrame(int32 t) { tot_variable_factor += variable_factor; double this_gamma_r = occupation_factor * variable_factor * (this_alpha_smbr_i + post + next_beta_smbr_j - tot_smbr_(s)); - log_prob_deriv_data[pdf_id * deriv_stride + s] += this_gamma_r; + log_prob_deriv_data[pdf_id * deriv_stride + s] += + opts_.smbr_factor * this_gamma_r + - opts_.mmi_factor * occupation_factor; } this_beta_dash[h * num_sequences + s] = tot_variable_factor / inv_arbitrary_scale; @@ -540,15 +570,14 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { << alpha_beta_smbr_sum << " != " << tot_smbr_sum; } - //BaseFloat acc = (VecVec(this_alpha_smbr, this_alpha_dash) - // + VecVec(this_beta_dash, this_beta_smbr)) - // / alpha_beta_product; // use higher tolerance, since we are using randomized pruning for the // log-prob derivatives. 
- if (GetVerboseLevel() > 1 || !ApproxEqual(this_log_prob_deriv_sum, 0, 0.01)) { + if (GetVerboseLevel() > 1 || !ApproxEqual( + this_log_prob_deriv_sum, -opts_.mmi_factor * num_sequences_, 0.01)) { KALDI_WARN << "On time " << t << ", log-prob-deriv sum " - << this_log_prob_deriv_sum << " != " << 0; - if (fabs(this_log_prob_deriv_sum - 0) > 2.0) { + << this_log_prob_deriv_sum << " != " + << opts_.mmi_factor * num_sequences_; + if (fabs(this_log_prob_deriv_sum + opts_.mmi_factor * num_sequences_) > 2.0) { KALDI_WARN << "Excessive error detected, will abandon this minibatch"; ok_ = false; } diff --git a/src/chain/chain-denominator-smbr.h b/src/chain/chain-denominator-smbr.h index a4c22ad4fd2..cebf63ccd40 100644 --- a/src/chain/chain-denominator-smbr.h +++ b/src/chain/chain-denominator-smbr.h @@ -232,7 +232,9 @@ class DenominatorSmbrComputation { // Does the forward computation, and returns the total objective summed // over all sequences. You will have to scale this by any supervision // weighting factor, manually. - BaseFloat ForwardSmbr(); + // aux_objf stores the value of the auxiliary MMI objective scaled by + // opts.mmi_factor + BaseFloat ForwardSmbr(BaseFloat *aux_objf); // this adds deriv_weight times (the derivative of the objective w.r.t. the // nnet output), to 'nnet_output_deriv'. @@ -260,11 +262,13 @@ class DenominatorSmbrComputation { // doing correction) log_correction_term_. Note, this won't be scaled by // 'deriv_scale' (which of course we haven't seen by the time this is called, // from the ForwardSmbr() computation). - BaseFloat ComputeTotObjf(); + // aux_objf stores the value of the auxiliary MMI objective scaled by + // opts.mmi_factor + BaseFloat ComputeTotObjf(BaseFloat *aux_objf); void BetaSmbrDashLastFrame(); // beta computation for 0 <= beta < num_time_steps_. - void BetaSmbrGeneralFrame(int32 t); + void BetaSmbrDashGeneralFrame(int32 t); // compute the beta quantity from the beta-dash quantity (relates to leaky hmm). void BetaSmbr(int32 t); @@ -326,10 +330,22 @@ class DenominatorSmbrComputation { // the total smbr for each sequence. CuVector tot_smbr_; - + + // the log of tot_prob_. + CuVector tot_log_prob_; + + // the log of the total correction term for each sequence, which is the + // product of the alpha-sums [used in the leaky-hmm computation] over all the + // frames. The 'correction terms' are terms that we divide the alphas and + // betas by in order to keep them in a good dynamic range. The product of + // them must be included in the total likelihood. 
+ CuVector log_correction_term_; + bool ok_; }; + + } // namespace chain } // namespace kaldi diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index d37bcd83e5a..3515725cdcb 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -65,7 +65,9 @@ extern "C" { BaseFloat *this_beta, BaseFloat *this_beta_smbr, BaseFloat *log_prob_deriv, - int32_cuda log_prob_deriv_stride); + int32_cuda log_prob_deriv_stride, + BaseFloat mmi_factor, + BaseFloat smbr_factor); void cuda_chain_smbr_hmm_forward(dim3 Gr, dim3 Bl, const Int32Pair *backward_transitions, diff --git a/src/chain/chain-smbr-kernels.cu b/src/chain/chain-smbr-kernels.cu index cfcf19dffad..a1804149939 100644 --- a/src/chain/chain-smbr-kernels.cu +++ b/src/chain/chain-smbr-kernels.cu @@ -216,7 +216,8 @@ static void _cuda_chain_smbr_hmm_backward( const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, BaseFloat *this_beta, BaseFloat *this_beta_smbr, - BaseFloat *log_prob_deriv, int32_cuda log_prob_deriv_stride) { + BaseFloat *log_prob_deriv, int32_cuda log_prob_deriv_stride, + BaseFloat mmi_factor, BaseFloat smbr_factor) { // 'forward_transitions', indexed by hmm-state, consists of [start, end] // indexes into the 'transition_info' array. This is about the transitions // *out of* this state. 'probs' contains the exponentiated neural net @@ -275,14 +276,16 @@ static void _cuda_chain_smbr_hmm_backward( tot_beta_smbr += (next_beta_smbr_j0 + num_post0) * variable_factor0 + (next_beta_smbr_j1 + num_post1) * variable_factor1; tot_variable_factor += variable_factor0 + variable_factor1; - BaseFloat this_gamma_r0 = occupation_factor * variable_factor0 + BaseFloat occupation_prob0 = variable_factor0 * occupation_factor; + BaseFloat this_gamma_r0 = occupation_prob0 * (this_alpha_smbr_i + num_post0 + next_beta_smbr_j0 - tot_smbr[s]); atomic_add(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), - this_gamma_r0); - BaseFloat this_gamma_r1 = occupation_factor * variable_factor1 + smbr_factor * this_gamma_r0 - mmi_factor * occupation_prob0); + BaseFloat occupation_prob1 = variable_factor1 * occupation_factor; + BaseFloat this_gamma_r1 = occupation_prob1 * (this_alpha_smbr_i + num_post1 + next_beta_smbr_j1 - tot_smbr[s]); atomic_add(log_prob_deriv + (pdf_id1 * log_prob_deriv_stride + s), - this_gamma_r1); + smbr_factor * this_gamma_r1 - mmi_factor * occupation_prob1); } if (trans_iter != trans_end) { // mop up the odd transition. 
@@ -296,10 +299,11 @@ static void _cuda_chain_smbr_hmm_backward( BaseFloat variable_factor0 = transition_prob0 * next_beta_j0 * prob0; tot_beta_smbr += (next_beta_smbr_j0 + num_post0) * variable_factor0; tot_variable_factor += variable_factor0; - BaseFloat this_gamma_r0 = occupation_factor * variable_factor0 + BaseFloat occupation_prob0 = variable_factor0 * occupation_factor; + BaseFloat this_gamma_r0 = occupation_prob0 * (this_alpha_smbr_i + num_post0 + next_beta_smbr_j0 - tot_smbr[s]); atomic_add(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), - this_gamma_r0); + smbr_factor * this_gamma_r0 - mmi_factor * occupation_prob0); } BaseFloat beta = tot_variable_factor / inv_arbitrary_scale; this_beta[h * num_sequences + s] = beta; @@ -342,12 +346,13 @@ void cuda_chain_smbr_hmm_backward( const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, BaseFloat *this_beta, BaseFloat *this_beta_smbr, BaseFloat *log_prob_deriv, - int32_cuda log_prob_deriv_stride) { + int32_cuda log_prob_deriv_stride, + BaseFloat mmi_factor, BaseFloat smbr_factor) { _cuda_chain_smbr_hmm_backward<<>>( forward_transitions, transitions, num_sequences, num_hmm_states, probs, prob_stride, num_post, post_stride, tot_smbr, this_alpha, this_alpha_smbr, next_beta, next_beta_smbr, this_beta, this_beta_smbr, log_prob_deriv, - log_prob_deriv_stride); + log_prob_deriv_stride, mmi_factor, smbr_factor); } diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 87e5829bfd3..fcde7d735f9 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -119,7 +119,8 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrixBase *xent_output_deriv) { + CuMatrixBase *xent_output_deriv, + const CuArray *sil_indices) { // num_posteriors is a matrix of size // (num_sequences * frames_per_sequence) x num_pdfs and is ordered in the // same way as nnet_output is i.e. @@ -127,27 +128,49 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, // each sequence, and so on. CuMatrix num_posteriors(nnet_output.NumRows(), nnet_output.NumCols()); + + BaseFloat num_logprob_weighted; { NumeratorComputation numerator(supervision, nnet_output); // note: supervision.weight is included as a factor in the derivative from // the numerator object, and the logprob too. 
- numerator.Forward(); + num_logprob_weighted = opts.mmi_factor * numerator.Forward(); numerator.Backward(&num_posteriors); + + if (nnet_output_deriv && opts.mmi_factor != 0.0) { + nnet_output_deriv->CopyFromMat(num_posteriors); + nnet_output_deriv->Scale(opts.mmi_factor); + } + if (xent_output_deriv) { xent_output_deriv->CopyFromMat(num_posteriors); } } + + if (sil_indices) + num_posteriors.CopyCols(num_posteriors, *sil_indices); + DenominatorSmbrComputation denominator(opts, den_graph, supervision.num_sequences, nnet_output, num_posteriors); - BaseFloat smbr_objf = denominator.ForwardSmbr(); + + BaseFloat mmi_objf; + BaseFloat smbr_objf = denominator.ForwardSmbr(&mmi_objf); + + if (opts.mmi_factor != 0.0) { + DenominatorComputation denominator_mmi(opts, den_graph, + supervision.num_sequences, + nnet_output); + KALDI_ASSERT(kaldi::ApproxEqual(-mmi_objf, opts.mmi_factor * denominator_mmi.Forward())); + } + bool ok = true; if (nnet_output_deriv) { - nnet_output_deriv->SetZero(); + if (opts.mmi_factor == 0.0) nnet_output_deriv->SetZero(); ok = denominator.BackwardSmbr(supervision.weight, nnet_output_deriv); } - *objf = supervision.weight * smbr_objf; + *objf = supervision.weight * (smbr_objf + mmi_objf) + num_logprob_weighted; *weight = supervision.weight * supervision.num_sequences * supervision.frames_per_sequence; if (!((*objf) - (*objf) == 0) || !ok) { @@ -156,7 +179,7 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, nnet_output_deriv->SetZero(); if (xent_output_deriv) xent_output_deriv->SetZero(); - BaseFloat default_objf = 0; + BaseFloat default_objf = -opts.mmi_factor * 10; KALDI_WARN << "Objective function is " << (*objf) << " and denominator computation (if done) returned " << std::boolalpha << ok diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index beca5c0b92f..798b342316d 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -63,8 +63,14 @@ struct ChainTrainingOptions { bool use_smbr_objective; + std::string silence_pdfs_str; + + BaseFloat mmi_factor; + BaseFloat smbr_factor; + ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), - xent_regularize(0.0), use_smbr_objective(false) { } + xent_regularize(0.0), use_smbr_objective(false), + mmi_factor(0.0), smbr_factor(1.0) { } void Register(OptionsItf *opts) { opts->Register("l2-regularize", &l2_regularize, "l2 regularization " @@ -82,6 +88,16 @@ struct ChainTrainingOptions { "its final nonlinearity."); opts->Register("use-smbr-objective", &use_smbr_objective, "Use SMBR objective instead of MMI"); + opts->Register("silence-pdfs", &silence_pdfs_str, + "A comma-separated list of silence pdfs. 
" + "It makes sense only when the silence pdfs are " + "context-independent."); + opts->Register("mmi-factor", &mmi_factor, + "When using smbr objective, interpolate mmi objective " + "with this weight"); + opts->Register("smbr-factor", &smbr_factor, + "When using smbr objective, interpolate smbr objective " + "with this weight"); } }; @@ -165,7 +181,8 @@ void ComputeChainSmbrObjfAndDeriv( BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrixBase *xent_output_deriv = NULL); + CuMatrixBase *xent_output_deriv = NULL, + const CuArray *sil_indices = NULL); } // namespace chain diff --git a/src/chainbin/nnet3-chain-compute-prob.cc b/src/chainbin/nnet3-chain-compute-prob.cc index 8cf25d4ad08..77a4e44f22d 100644 --- a/src/chainbin/nnet3-chain-compute-prob.cc +++ b/src/chainbin/nnet3-chain-compute-prob.cc @@ -82,6 +82,9 @@ int main(int argc, char *argv[]) { fst::StdVectorFst den_fst; ReadFstKaldi(den_fst_rxfilename, &den_fst); + if (GetVerboseLevel() > 2) + nnet_opts.compute_deriv = true; + NnetChainComputeProb chain_prob_computer(nnet_opts, chain_opts, den_fst, nnet); diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index b9f690e923c..40ca1ecff14 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -100,7 +100,6 @@ class CuMatrixBase { void CopyCols(const CuMatrixBase &src, const CuArray &indexes); - /// Add column indices[r] of src to column r. /// As a special case, if indexes[i] == -1, skip column i /// indices.size() must equal this->NumCols(), @@ -108,6 +107,9 @@ class CuMatrixBase { void AddCols(const CuMatrixBase &src, const CuArray &indices); + /// Sets all elements in column indexes defined by 'ids' to 'value' + void SetCols(Real value, const CuArray &ids); + /// Copies row r from row indexes[r] of src. /// As a special case, if indexes[i] < 0, sets row i to zero. /// src.NumCols() must equal this.NumCols() diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index c8f58bedebb..695bdb4f83c 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -55,6 +55,30 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, "Probably this is the first training iteration."; } } + + if (!opts.chain_config.silence_pdfs_str.empty()) { + std::vector silence_pdfs; + SplitStringToVector(opts.chain_config.silence_pdfs_str, ":,", false, + &silence_pdfs); + + int32 num_pdfs = nnet->OutputDim("output"); + std::vector indices(num_pdfs); + for (size_t i = 0; i < num_pdfs; i++) { + indices[i] = i; + } + + for (std::vector::iterator it = silence_pdfs.begin(); + it != silence_pdfs.end(); ++it) { + int32 pdf = std::atoi(it->c_str()); + if (pdf > num_pdfs) + KALDI_ERR << "Invalid pdf " << pdf << " in silence-pdfs " + << opts.chain_config.silence_pdfs_str; + indices[pdf] = -1; + } + + sil_indices_.Resize(num_pdfs); + sil_indices_.CopyFromVec(indices); + } } @@ -185,7 +209,8 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, sup.supervision, nnet_output, &tot_objf, &tot_l2_term, &tot_weight, &nnet_output_deriv, - (use_xent ? &xent_deriv : NULL)); + (use_xent ? &xent_deriv : NULL), + sil_indices_.Dim() ? 
&sil_indices_ : NULL); } else { ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, sup.supervision, nnet_output, diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h index b76608e2794..9d57217a87c 100644 --- a/src/nnet3/nnet-chain-training.h +++ b/src/nnet3/nnet-chain-training.h @@ -114,6 +114,8 @@ class NnetChainTrainer { // consistent dropout masks. It's set to a value derived from rand() // when the class is initialized. int32 srand_seed_; + + CuArray sil_indices_; }; diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index d462ce890d4..72002b4d921 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -335,7 +335,7 @@ std::string NonlinearComponent::Info() const { stream << ", self-repair-upper-threshold=" << self_repair_upper_threshold_; if (self_repair_scale_ != 0.0) stream << ", self-repair-scale=" << self_repair_scale_; - if (count_ > 0 && value_sum_.Dim() == dim_ && deriv_sum_.Dim() == dim_) { + if (count_ > 0 && value_sum_.Dim() == dim_) { stream << ", count=" << std::setprecision(3) << count_ << std::setprecision(6); stream << ", self-repaired-proportion=" @@ -345,11 +345,15 @@ std::string NonlinearComponent::Info() const { Vector value_avg(value_avg_dbl); value_avg.Scale(1.0 / count_); stream << ", value-avg=" << SummarizeVector(value_avg); - Vector deriv_avg_dbl(deriv_sum_); - Vector deriv_avg(deriv_avg_dbl); - deriv_avg.Scale(1.0 / count_); - stream << ", deriv-avg=" << SummarizeVector(deriv_avg); + + if (deriv_sum_.Dim() == dim_) { + Vector deriv_avg_dbl(deriv_sum_); + Vector deriv_avg(deriv_avg_dbl); + deriv_avg.Scale(1.0 / count_); + stream << ", deriv-avg=" << SummarizeVector(deriv_avg); + } } + return stream.str(); } diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 3e589df029b..deee3fff687 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -2993,6 +2993,15 @@ void LogSoftmaxComponent::Backprop(const std::string &debug_info, in_deriv->DiffLogSoftmaxPerRow(out_value, out_deriv); } +void LogSoftmaxComponent::StoreStats(const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + void *memo) { + // We don't store derivative stats for this component type, just activation + // stats. 
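// The output of this component is log-probabilities, so we exponentiate
// before accumulating: the stored value stats are then average posterior
// probabilities per dimension (consistent with the Info() change above,
// which now prints deriv-avg only when deriv_sum_ has actually been
// accumulated).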
+ CuMatrix out_exp(out_value); + out_exp.ApplyExp(); + StoreStatsInternal(out_exp, NULL); +} void FixedScaleComponent::Init(const CuVectorBase &scales) { KALDI_ASSERT(scales.Dim() != 0); diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index d19be76461a..9346fbbc34a 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -757,6 +757,9 @@ class LogSoftmaxComponent: public NonlinearComponent { void *memo, Component *to_update, CuMatrixBase *in_deriv) const; + virtual void StoreStats(const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + void *memo); virtual Component* Copy() const { return new LogSoftmaxComponent(*this); } private: From 9fedda920293b37426de62a134cfabf39de10ad9 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sun, 23 Jul 2017 18:14:01 -0400 Subject: [PATCH 030/174] chain: Updating chain script --- .../nnet3/train/chain_objf/acoustic_model.py | 30 +++--- .../libs/nnet3/train/dropout_schedule.py | 11 +++ egs/wsj/s5/steps/nnet3/chain/train.py | 98 ++++++++++++++++--- 3 files changed, 106 insertions(+), 33 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index fc49f9fedff..7a98bc7dfae 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -132,7 +132,7 @@ def train_new_models(dir, iter, srand, num_jobs, shuffle_buffer_size, num_chunk_per_minibatch_str, frame_subsampling_factor, truncate_deriv_weights, run_opts, backstitch_training_scale=0.0, backstitch_training_interval=1, - use_smbr_objective=False): + smbr_opt=""): """ Called from train_one_iteration(), this method trains new models with 'num_jobs' jobs, and @@ -216,8 +216,7 @@ def train_new_models(dir, iter, srand, num_jobs, egs_dir=egs_dir, archive_index=archive_index, buf_size=shuffle_buffer_size, num_chunk_per_mb=num_chunk_per_minibatch_str, - smbr_opt="--use-smbr-objective" - if use_smbr_objective else ""), + smbr_opt=smbr_opt), require_zero_status=True) threads.append(thread) @@ -240,7 +239,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, frame_subsampling_factor, truncate_deriv_weights, run_opts, dropout_edit_string="", backstitch_training_scale=0.0, backstitch_training_interval=1, - use_smbr_objective=False): + smbr_opt=""): """ Called from steps/nnet3/chain/train.py for one iteration for neural network training with LF-MMI objective @@ -273,7 +272,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, dir=dir, iter=iter, egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, - use_smbr_objective=use_smbr_objective) + smbr_opt=smbr_opt) if iter > 0: # Runs in the background @@ -304,7 +303,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, if shrinkage_value != 1.0: shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value) - objf_info = "" if not use_smbr_objective else "and objective is sMBR" + objf_info = "" if smbr_opt != "" else "and objective is sMBR" logger.info("On iteration {0}, learning rate is {1}" "{shrink_info} {objf_info}.".format( iter, learning_rate, @@ -333,7 +332,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, backstitch_training_scale=(backstitch_training_scale * iter / 15 if iter < 15 else backstitch_training_scale), backstitch_training_interval=backstitch_training_interval, - 
use_smbr_objective=use_smbr_objective) + smbr_opt=smbr_opt) [models_to_average, best_model] = common_train_lib.get_successful_models( num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) @@ -460,8 +459,7 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1): def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, - run_opts, - use_smbr_objective=False): + run_opts, smbr_opt=""): model = '{0}/{1}.mdl'.format(dir, iter) common_lib.background_command( @@ -475,8 +473,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, l2=l2_regularize, leaky=leaky_hmm_coefficient, xent_reg=xent_regularize, egs_dir=egs_dir, - smbr_opt="--use-smbr-objective" - if use_smbr_objective else "")) + smbr_opt=smbr_opt)) common_lib.background_command( @@ -490,8 +487,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, l2=l2_regularize, leaky=leaky_hmm_coefficient, xent_reg=xent_regularize, egs_dir=egs_dir, - smbr_opt="--use-smbr-objective" - if use_smbr_objective else "")) + smbr_opt=smbr_opt)) def compute_progress(dir, iter, run_opts): @@ -515,7 +511,7 @@ def compute_progress(dir, iter, run_opts): def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, egs_dir, leaky_hmm_coefficient, l2_regularize, xent_regularize, run_opts, - sum_to_one_penalty=0.0, use_smbr_objective=False): + sum_to_one_penalty=0.0, smbr_opt=""): """ Function to do model combination In the nnet3 setup, the logic @@ -574,8 +570,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st num_chunk_per_mb=num_chunk_per_minibatch_str, num_iters=num_iters, egs_dir=egs_dir, - smbr_opt="--use-smbr-objective" if use_smbr_objective - else "")) + smbr_opt=smbr_opt)) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the @@ -584,5 +579,4 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st dir=dir, iter='final', egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, leaky_hmm_coefficient=leaky_hmm_coefficient, - run_opts=run_opts, - use_smbr_objective=use_smbr_objective) + run_opts=run_opts, smbr_opt=smbr_opt) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py index 0ad93e5977d..f94d95136c4 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py @@ -210,6 +210,17 @@ def _get_dropout_proportions(dropout_schedule, data_fraction): return dropout_proportions +def get_schedule_value(schedule, data_fraction): + if schedule is None: + return 0 + proportions = _get_dropout_proportions( + schedule, data_fraction) + + assert len(proportions) == 1 + assert len(proportions[0]) == 2 and proportions[0][0] == '*' + return proportions[0][1] + + def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): """Return an nnet3-copy --edits line to modify raw_model_string to set dropout proportions according to dropout_proportions. diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 2bb86c18459..a4b2533fd59 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -104,12 +104,14 @@ def get_args(): dest='left_deriv_truncate', default=None, help="Deprecated. 
Kept for back compatibility") - parser.add_argument("--chain.smbr-start-fraction", type=float, - dest='smbr_start_fraction', default=1.1, - help="Fraction of training at which to start LF-SMBR") - parser.add_argument("--chain.smbr-learning-rate-factor", default=1.0, - dest='smbr_learning_rate_factor', type=float, - help="Learning rate factor used for sMBR training") + parser.add_argument("--chain.smbr-factor-schedule", type=str, + dest='smbr_factor_schedule', default=None, + action=common_lib.NullstrToNoneAction, + help="Schedule for sMBR factor in LF-SMBR training.") + parser.add_argument("--chain.mmi-factor-schedule", type=str, + dest='mmi_factor_schedule', default=None, + action=common_lib.NullstrToNoneAction, + help="Schedule for MMI factor in LF-SMBR training.") parser.add_argument("--chain.smbr-xent-regularize", default=None, dest='smbr_xent_regularize', type=float, help="Xent regularizer term used with sMBR training") @@ -174,6 +176,8 @@ def get_args(): 'required' part of the chunk is defined by the model's {left,right}-context.""") + parser.add_argument("--lang", type=str, help="Lang directory") + # General options parser.add_argument("--feat-dir", type=str, required=True, help="Directory with features used for training " @@ -263,6 +267,36 @@ def process_args(args): return [args, run_opts] +def get_silence_pdfs(args): + if args.lang is None: + return "" + + out = common_lib.get_command_stdout( + "am-info {0}/0.trans_mdl | grep transition-ids".format(args.dir)) + num_tids = int(out.split()[-1]) + + out = common_lib.get_command_stdout( + "seq -s ' ' 0 {num_tids} | ali-to-pdf " + "{dir}/0.trans_mdl ark,t:- ark,t:-" + "".format(num_tids=num_tids-1, dir=args.dir)) + pdfs = [int(x) for x in out.split()[1:]] + + out = common_lib.get_command_stdout( + "seq -s ' ' 0 {num_tids} | ali-to-phones --per-frame " + "{dir}/0.trans_mdl ark,t:- ark,t:-" + "".format(num_tids=num_tids-1, dir=args.dir)) + phones = [int(x) for x in out.split()[1:]] + + silence_phones_list = open( + "{lang}/phones/silence.int" + "".format(lang=args.lang)).readline() + silence_phones = set([int(x) for x in silence_phones_list.split(":")]) + + silence_pdfs = list(set([str(pdfs[i]) for i, ph in enumerate(phones) + if ph in silence_phones])) + return ",".join(sorted(silence_pdfs)) + + def train(args, run_opts): """ The main function for training. 
@@ -432,6 +466,8 @@ def train(args, run_opts): max_deriv_time_relative = \ args.deriv_truncate_margin + model_right_context + silence_pdfs = get_silence_pdfs(args) + logger.info("Training will run for {0} epochs = " "{1} iterations".format(args.num_epochs, num_iters)) @@ -466,17 +502,33 @@ def train(args, run_opts): xent_regularize = args.xent_regularize l2_regularize = args.l2_regularize - use_smbr=False - if (float(num_archives_processed) / num_archives_to_process - >= args.smbr_start_fraction): + smbr_opt = "" + smbr_factor = 0.0 + if args.smbr_factor_schedule is not None: + smbr_factor = common_train_lib.get_schedule_value( + args.smbr_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + smbr_opt += " --smbr-factor={0}".format(smbr_factor) + + if smbr_factor > 0.0: use_smbr=True - lrate *= args.smbr_learning_rate_factor xent_regularize = (args.smbr_xent_regularize if args.smbr_xent_regularize is not None else args.xent_regularize) l2_regularize = (args.smbr_l2_regularize if args.smbr_l2_regularize is not None else args.l2_regularize) + smbr_opt = "--use-smbr-objective" + if silence_pdfs is not None: + smbr_opt += " --silence-pdfs=" + silence_pdfs + + if args.mmi_factor_schedule is not None: + mmi_factor = common_train_lib.get_schedule_value( + args.mmi_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + smbr_opt += " --mmi-factor={0}".format(mmi_factor) chain_lib.train_one_iteration( dir=args.dir, @@ -507,7 +559,7 @@ def train(args, run_opts): run_opts=run_opts, backstitch_training_scale=args.backstitch_training_scale, backstitch_training_interval=args.backstitch_training_interval, - use_smbr_objective=use_smbr) + smbr_opt=smbr_opt) if args.cleanup: # do a clean up everythin but the last 2 models, under certain @@ -535,9 +587,15 @@ def train(args, run_opts): xent_regularize = args.xent_regularize l2_regularize = args.l2_regularize - use_smbr = False - if (float(num_archives_processed) / num_archives_to_process - >= args.smbr_start_fraction): + smbr_opt = "" + smbr_factor = 0.0 + if args.smbr_factor_schedule is not None: + smbr_factor = common_train_lib.get_schedule_value( + args.smbr_factor_schedule, 1.0) + + smbr_opt += " --smbr-factor={0}".format(smbr_factor) + + if smbr_factor > 0.0: use_smbr=True xent_regularize = (args.smbr_xent_regularize if args.smbr_xent_regularize is not None @@ -545,6 +603,16 @@ def train(args, run_opts): l2_regularize = (args.smbr_l2_regularize if args.smbr_l2_regularize is not None else args.l2_regularize) + smbr_opt = "--use-smbr-objective" + if silence_pdfs is not None: + smbr_opt += " --silence-pdfs=" + silence_pdfs + + if args.mmi_factor_schedule is not None: + mmi_factor = common_train_lib.get_schedule_value( + args.mmi_factor_schedule, 1.0) + + smbr_opt += " --mmi-factor={0}".format(mmi_factor) + chain_lib.combine_models( dir=args.dir, num_iters=num_iters, models_to_combine=models_to_combine, @@ -555,7 +623,7 @@ def train(args, run_opts): xent_regularize=xent_regularize, run_opts=run_opts, sum_to_one_penalty=args.combine_sum_to_one_penalty, - use_smbr_objective=use_smbr) + smbr_opt=smbr_opt) if args.cleanup: logger.info("Cleaning up the experiment directory " From e51826a245b61f390e71ef9fb037e81c2cfb0430 Mon Sep 17 00:00:00 2001 From: Pegita Date: Sat, 29 Jul 2017 17:29:19 -0400 Subject: [PATCH 031/174] fixed small issue with language-model.*. 
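Referring back to the --chain.smbr-factor-schedule and --chain.mmi-factor-schedule
options added in the previous patch: get_schedule_value() evaluates such a schedule
at the current data fraction (num_archives_processed / num_archives_to_process), so
the MMI and sMBR weights can be ramped up or down over the course of training. The
exact string syntax is whatever the dropout-schedule parser accepts; the standalone
sketch below assumes a comma-separated "value@data_fraction" form with linear
interpolation between points, purely for illustration.

    # Hypothetical sketch of piecewise-linear schedule evaluation; the real
    # parsing is done by _get_dropout_proportions() in
    # steps/libs/nnet3/train/dropout_schedule.py, and the "value@data_fraction"
    # format assumed here is an illustration, not a guarantee of that syntax.
    def schedule_value(schedule, data_fraction):
        points = []
        pieces = schedule.split(",")
        for i, piece in enumerate(pieces):
            if "@" in piece:
                value, fraction = piece.split("@")
            else:
                # assume the first entry applies at fraction 0.0, the last at 1.0
                value, fraction = piece, (0.0 if i == 0 else 1.0)
            points.append((float(fraction), float(value)))
        points.sort()
        if data_fraction <= points[0][0]:
            return points[0][1]
        for (f0, v0), (f1, v1) in zip(points, points[1:]):
            if data_fraction <= f1:
                return v0 + (v1 - v0) * (data_fraction - f0) / (f1 - f0)
        return points[-1][1]

    # e.g. keep the MMI term at 1.0 for the first half of training, then fade it out:
    # schedule_value("1.0,1.0@0.5,0.0", 0.75) == 0.5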
--- src/chain/language-model.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/chain/language-model.h b/src/chain/language-model.h index daeea9395b9..1109d832bf7 100644 --- a/src/chain/language-model.h +++ b/src/chain/language-model.h @@ -91,7 +91,7 @@ class LanguageModelEstimator { // Adds counts for this sentence. Basically does: for each n-gram in the // sentence, count[n-gram] += weight. The only constraint on 'sentence' is that it // should contain no zeros. - void AddCounts(const std::vector &sentence, int32 weight); + void AddCounts(const std::vector &sentence, int32 weight = 1.0); // Estimates the LM and outputs it as an FST. Note: there is // no concept here of backoff arcs. From d64d0177f8def29c62a9f85affdf2147403ae6be Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 4 Aug 2017 14:29:04 -0400 Subject: [PATCH 032/174] semisup: Updating semisupervised scripts --- .../s5/local/chain/compare_wer_general.py | 239 +++++++++++ .../s5/local/chain/compare_wer_general.sh | 63 +-- .../s5/local/chain/confidence_calibration.sh | 88 ++++ .../s5/local/chain/tuning/run_tdnn_b.sh | 198 +++++++++ .../chain/tuning/run_tdnn_semisupervised_a.sh | 145 +++++++ .../chain/tuning/run_tdnn_semisupervised_b.sh | 253 +++++++++++ .../tuning/run_tdnn_semisupervised_conf_a.sh | 331 +++++++++++++++ .../tuning/run_tdnn_semisupervised_conf_c.sh | 380 +++++++++++++++++ .../tuning/run_tdnn_semisupervised_conf_d.sh | 298 +++++++++++++ .../tuning/run_tdnn_semisupervised_conf_e.sh | 396 ++++++++++++++++++ .../tuning/run_tdnn_semisupervised_conf_f.sh | 347 +++++++++++++++ .../tuning/run_tdnn_semisupervised_conf_g.sh | 383 +++++++++++++++++ .../tuning/run_tdnn_semisupervised_conf_h.sh | 348 +++++++++++++++ .../tuning/run_tdnn_semisupervised_conf_i.sh | 339 +++++++++++++++ .../tuning/run_tdnn_semisupervised_conf_j.sh | 339 +++++++++++++++ .../tuning/run_tdnn_semisupervised_conf_k.sh | 339 +++++++++++++++ .../tuning/run_tdnn_semisupervised_conf_l.sh | 339 +++++++++++++++ .../tuning/run_tdnn_semisupervised_conf_m.sh | 339 +++++++++++++++ .../tuning/run_tdnn_semisupervised_conf_n.sh | 339 +++++++++++++++ .../tuning/run_tdnn_semisupervised_conf_o.sh | 341 +++++++++++++++ .../tuning/run_tdnn_semisupervised_conf_p.sh | 340 +++++++++++++++ .../tuning/run_tdnn_semisupervised_conf_q.sh | 340 +++++++++++++++ .../s5/local/nnet3/run_ivector_common_pca.sh | 103 +++++ .../semisup/chain/tuning/run_tdnn_11k.sh | 193 +++++++++ .../run_tdnn_11k_semisupervised_conf_a.sh | 353 ++++++++++++++++ .../s5/local/semisup/run_10k.sh | 60 +++ .../s5/local/semisup/run_15k.sh | 66 +++ egs/wsj/s5/steps/best_path_weights.sh | 160 +++++++ egs/wsj/s5/steps/conf/apply_calibration.sh | 11 +- .../s5/steps/conf/convert_ctm_to_weights.py | 101 +++++ .../s5/steps/conf/prepare_calibration_data.py | 16 +- egs/wsj/s5/steps/conf/train_calibration.sh | 13 +- egs/wsj/s5/steps/libs/common.py | 163 ++++++- .../nnet3/train/chain_objf/acoustic_model.py | 100 ++++- egs/wsj/s5/steps/libs/nnet3/train/common.py | 2 +- .../nnet3/train/frame_level_objf/common.py | 18 +- .../steps/nnet3/chain/build_tree_from_lats.sh | 201 +++++++++ .../chain/build_tree_multiple_sources.sh | 283 +++++++++++++ egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 76 +++- egs/wsj/s5/steps/nnet3/chain/make_den_fst.sh | 86 ++++ egs/wsj/s5/steps/nnet3/chain/train.py | 18 +- .../allocate_multilingual_examples.py | 70 +++- .../steps/nnet3/multilingual/combine_egs.sh | 65 ++- egs/wsj/s5/steps/subset_ali_dir.sh | 48 +++ egs/wsj/s5/utils/queue.pl | 8 + src/chain/chain-supervision.cc | 7 +- 
src/chain/language-model.cc | 11 +- src/chain/language-model.h | 4 +- src/chainbin/chain-est-phone-lm.cc | 39 +- src/chainbin/nnet3-chain-copy-egs.cc | 86 +++- src/chainbin/nnet3-chain-get-egs.cc | 90 +++- src/chainbin/nnet3-chain-normalize-egs.cc | 12 + src/nnet3/nnet-chain-combine.cc | 16 +- src/nnet3/nnet-chain-diagnostics.cc | 38 +- src/nnet3/nnet-chain-diagnostics.h | 3 + src/nnet3/nnet-chain-example.cc | 14 +- src/nnet3/nnet-example-utils.cc | 21 +- src/nnet3/nnet-example-utils.h | 5 +- 58 files changed, 8901 insertions(+), 183 deletions(-) create mode 100755 egs/fisher_english/s5/local/chain/compare_wer_general.py create mode 100755 egs/fisher_english/s5/local/chain/confidence_calibration.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_b.sh create mode 100755 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_a.sh create mode 100755 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_b.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_a.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_c.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_d.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_e.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_f.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_g.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_h.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_i.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_j.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_k.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_l.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_m.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_n.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_o.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_p.sh create mode 100644 egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_q.sh create mode 100755 egs/fisher_english/s5/local/nnet3/run_ivector_common_pca.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh create mode 100644 egs/fisher_english/s5/local/semisup/run_10k.sh create mode 100644 egs/fisher_english/s5/local/semisup/run_15k.sh create mode 100755 egs/wsj/s5/steps/best_path_weights.sh create mode 100755 egs/wsj/s5/steps/conf/convert_ctm_to_weights.py create mode 100755 egs/wsj/s5/steps/nnet3/chain/build_tree_from_lats.sh create mode 100755 egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh create mode 100755 egs/wsj/s5/steps/nnet3/chain/make_den_fst.sh create mode 100755 egs/wsj/s5/steps/subset_ali_dir.sh diff --git a/egs/fisher_english/s5/local/chain/compare_wer_general.py b/egs/fisher_english/s5/local/chain/compare_wer_general.py new file mode 100755 index 00000000000..e3a2dc5417a --- /dev/null +++ b/egs/fisher_english/s5/local/chain/compare_wer_general.py @@ -0,0 +1,239 @@ +#! 
/usr/bin/env python + +import argparse +import collections +import os +import re +import sys + +sys.path.insert(0, 'steps') +import libs.common as common_lib + +from collections import defaultdict + +def get_args(): + parser = argparse.ArgumentParser( + description=""" +This script is used for comparing decoding results between systems. +e.g. local/chain/compare_wer_general.py exp/chain_cleaned/tdnn_{c,d}_sp +For use with discriminatively trained systems you specify the epochs after a colon: +for instance, +local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_c_sp exp/chain_cleaned/tdnn_c_sp_smbr:{1,2,3} +""") + + parser.add_argument("--separator", type=str, default=" ", + help="Separator for different fields") + parser.add_argument("--print-fine-details", action='store_true', + help="Add fine details of insertions, substitutions " + "and deletions.") + parser.add_argument("--include-looped", action='store_true', + help="Used to include looped results") + parser.add_argument("--field-size", type=int, + help="Field size for the models") + parser.add_argument("systems", nargs='+') + + args = parser.parse_args() + return args + + +def parse_system_string(system_string): + parts = system_string.split(":") + if len(parts) not in [1, 2, 3]: + raise RuntimeError("Unable to parse system string {0}" + "".format(system_string)) + + dir_name = parts[0] + + suffix = "" + if len(parts) > 1: + suffix = parts[1] + + model_name = os.path.basename(dir_name) + if len(parts) > 2: + model_name = parts[2] + + return (dir_name, suffix, model_name) + + +class SystemInfo(object): + def __init__(self, dir_name, suffix, model_name): + self.dir_name = dir_name + self.suffix = suffix + self.model_name = model_name + self.iter_ = "final" + + if self.suffix != "": + m = re.search("_iter(\d+)", suffix) + if bool(m): + self.iter_ = m.group(1) + else: + used_epochs = False + + self.probs = [] + self.wers = defaultdict(lambda: "NA") + self.ins = defaultdict(lambda: "NA") + self.dels = defaultdict(lambda: "NA") + self.sub = defaultdict(lambda: "NA") + + def add_wer(self, dev_set, affix=""): + decode_name = dev_set + self.suffix + + out = common_lib.get_command_stdout( + "grep WER {dir_name}/decode{affix}_{decode_name}/wer* | utils/best_wer.sh" + "".format(dir_name=self.dir_name, affix=affix, + decode_name=decode_name), + require_zero_status=False) + + if out != "" and len(out.split()) >= 2: + self.wers[(dev_set, affix)] = out.split()[1] + self.ins[(dev_set, affix)] = out.split()[6] + self.dels[(dev_set, affix)] = out.split()[8] + self.sub[(dev_set, affix)] = out.split()[10] + + def _get_prob(self, set_="train", xent=False): + + if not os.path.exists( + "{dir_name}/log/compute_prob_{set}.{iter}.log" + "".format(dir_name=self.dir_name, set=set_, iter=self.iter_)): + return "NA" + + out = common_lib.get_command_stdout( + "grep Overall {dir_name}/log/compute_prob_{set}.{iter}.log | " + "grep {opt} xent".format(dir_name=self.dir_name, set=set_, + iter=self.iter_, + opt="-w" if xent else "-v"), + require_zero_status=False) + + if out == "": + return "NA" + + lines = out.split("\n") + prob = None + + affix = "-xent" if xent else "" + for line in lines: + if (bool(re.search(r"'output-0{0}'".format(affix), line)) + or bool(re.search(r"'output{0}'".format(affix), line))): + prob = float(line.split()[7]) + break + + return "NA" if prob is None else "{0:.4f}".format(prob) + + def add_probs(self): + self.probs.append(self._get_prob(set_="train", xent=False)) + self.probs.append(self._get_prob(set_="valid", xent=False)) + 
self.probs.append(self._get_prob(set_="train", xent=True)) + self.probs.append(self._get_prob(set_="valid", xent=True)) + + +def run(args): + used_epochs = False + systems = [] + for sys_string in args.systems: + dir_name, suffix, model_name = parse_system_string(sys_string) + info = SystemInfo(dir_name, suffix, model_name) + + if suffix != "" and re.search(suffix, "epoch"): + used_epochs = True + else: + used_epochs = False + + for dev_set in ["dev", "test"]: + info.add_wer(dev_set) + + if args.include_looped: + info.add_wer(dev_set, affix="_looped") + + if not used_epochs: + info.add_probs() + + systems.append(info) + + print_system_infos(args, systems, used_epochs) + + +def print_system_infos(args, system_infos, used_epochs=False): + field_sizes = [args.field_size] * len(system_infos) + + if args.field_size is None: + for i, x in enumerate(system_infos): + field_sizes[i] = len(x.model_name) + + separator = args.separator + print ("# {0: <25}{sep}{1}".format( + "System", + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.model_name, field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + tups = set() + for sys_info in system_infos: + for tup in sys_info.wers: + tups.add(tup) + + for tup in sorted(list(tups)): + dev_set, affix = tup + print ("# {0: <25}{sep}{1}".format( + "WER on {0} {1}" + "".format(dev_set, "[ "+affix+" ]" if affix != "" else ""), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.wers[tup], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + if args.print_fine_details: + print ("# {0: <25}{sep}{1}".format( + "#Ins on {0} {1}" + "".format(dev_set, "[ "+affix+" ]" if affix != "" else ""), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.ins[tup], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + print ("# {0: <25}{sep}{1}".format( + "#Del on {0} {1}" + "".format(dev_set, "[ "+affix+" ]" if affix != "" else ""), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.dels[tup], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + print ("# {0: <25}{sep}{1}".format( + "#Sub on {0} {1}" + "".format(dev_set, "[ "+affix+" ]" if affix != "" else ""), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.sub[tup], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + if not used_epochs: + print ("# {0: <25}{sep}{1}".format( + "Final train prob", + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.probs[0], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + print ("# {0: <25}{sep}{1}".format( + "Final valid prob", + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.probs[1], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + print ("# {0: <25}{sep}{1}".format( + "Final train prob (xent)", + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.probs[2], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + print ("# {0: <25}{sep}{1}".format( + "Final valid prob (xent)", + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.probs[3], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + +if __name__ == "__main__": + args = get_args() + run(args) diff --git a/egs/fisher_english/s5/local/chain/compare_wer_general.sh b/egs/fisher_english/s5/local/chain/compare_wer_general.sh index 
2f724c8ff81..4a7d2e87bd9 100755 --- a/egs/fisher_english/s5/local/chain/compare_wer_general.sh +++ b/egs/fisher_english/s5/local/chain/compare_wer_general.sh @@ -10,10 +10,8 @@ echo "# $0 $*" include_looped=false -if [ "$1" == "--looped" ]; then - include_looped=true - shift -fi + +. utils/parse_options.sh used_epochs=false @@ -21,10 +19,10 @@ used_epochs=false # [for discriminative training] and the regular parts of the name. # If called with a colon-free directory name, like: # set_names exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr -# it will set dir=exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# it will set dir=exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr and suffix="" # If called with something like: # set_names exp/chain_cleaned/tdnn_d_sp_smbr:3 -# it will set dir=exp/chain_cleaned/tdnn_d_sp_smbr and epoch_infix="_epoch3" +# it will set dir=exp/chain_cleaned/tdnn_d_sp_smbr and suffix="_epoch3" set_names() { @@ -33,37 +31,42 @@ set_names() { exit 1 # exit the program fi dirname=$(echo $1 | cut -d: -f1) - epoch=$(echo $1 | cut -s -d: -f2) - if [ -z $epoch ]; then - epoch_infix="" - else + suffix=$(echo $1 | cut -s -d: -f2) + model_name=$(echo $1 | cut -s -d: -f3) + if [ ! -z "$suffix" ] && [[ $suffix =~ *epoch* ]]; then used_epochs=true - epoch_infix=_epoch${epoch} + else + used_epochs=false + fi + if [ -z "$model_name" ]; then + model_name=$(basename $dirname) fi } -echo -n "# System " -for x in $*; do printf "% 10s" " $(basename $x)"; done +printf "# System %14s" "" +for x in $*; do + set_names $x # sets $dirname and $suffix + printf "% 10s" " $model_name"; done echo -strings=("# WER on dev " "# WER on test ") +strings=("WER on dev " "WER on test ") for n in 0 1; do - echo -n "${strings[$n]}" + printf "# %s % 6s" "${strings[$n]}" "" for x in $*; do - set_names $x # sets $dirname and $epoch_infix - decode_names=(dev${epoch_infix} test${epoch_infix}) + set_names $x # sets $dirname and $suffix + decode_names=(dev${suffix} test${suffix}) wer=$(grep WER $dirname/decode_${decode_names[$n]}/wer* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo if $include_looped; then - echo -n "# [looped:] " + printf "# % 20s" " [looped:]" for x in $*; do - set_names $x # sets $dirname and $epoch_infix - decode_names=(dev${epoch_infix} test${epoch_infix}) + set_names $x # sets $dirname and $suffix + decode_names=(dev${suffix} test${suffix}) wer=$(grep WER $dirname/decode_looped_${decode_names[$n]}/wer* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done @@ -76,30 +79,34 @@ if $used_epochs; then exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
fi -echo -n "# Final train prob " +printf "# % 20s" "Final train prob " for x in $*; do - prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + set_names $x # sets $dirname and $suffix + prob=$(grep Overall $dirname/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f ", $8)}') printf "% 10s" $prob done echo -echo -n "# Final valid prob " +printf "# % 20s" "Final valid prob " for x in $*; do - prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + set_names $x # sets $dirname and $suffix + prob=$(grep Overall $dirname/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f ", $8)}') printf "% 10s" $prob done echo -echo -n "# Final train prob (xent)" +printf "# % 20s" "Final train prob (xent)" for x in $*; do - prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + set_names $x # sets $dirname and $suffix + prob=$(grep Overall $dirname/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f ", $8)}') printf "% 10s" $prob done echo -echo -n "# Final valid prob (xent)" +printf "# % 20s" "Final valid prob (xent)" for x in $*; do - prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + set_names $x # sets $dirname and $suffix + prob=$(grep Overall $dirname/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f ", $8)}') printf "% 10s" $prob done diff --git a/egs/fisher_english/s5/local/chain/confidence_calibration.sh b/egs/fisher_english/s5/local/chain/confidence_calibration.sh new file mode 100755 index 00000000000..34a487085aa --- /dev/null +++ b/egs/fisher_english/s5/local/chain/confidence_calibration.sh @@ -0,0 +1,88 @@ +#!/bin/bash +. cmd.sh +. path.sh + +chaindir=exp/chain_semi350k_conf/tdnn_xxsup1a_sp +arpa_gz=data/local/lm_ex250k/3gram-mincount/lm_unpruned.gz +graph_affix=_ex250k +decode_affix= +train_set=train_sup_5k_calib_train +dev_set=dev_sup_5k_calib_dev + +. utils/parse_options.sh + +set -euxo pipefail + +train_data=data/${train_set}_hires +dev_data=data/${dev_set}_hires + +decode_affix=${decode_affix}${graph_affix} +graphdir=$chaindir/graph${graph_affix} +train_caldir=$chaindir/decode_${train_set}${decode_affix}/confidence +dev_caldir=$chaindir/decode_${dev_set}${decode_affix}/confidence + +###### Data preparation, + +# Prepare filtering for excluding data from train-set (1 .. keep word, 0 .. 
exclude word), +# - only excludes from training-targets, the confidences are recalibrated for all the words, +word_filter=$(mktemp) +awk '{ keep_the_word = $1 !~ /^(\[.*\]|<.*>|%.*|!.*|-.*|.*-)$/; print $0, keep_the_word }' \ + $graphdir/words.txt >$word_filter + +# Calcualte the word-length, +word_length=$(mktemp) +awk '{if(r==0) { len_hash[$1] = NF-2; } + if(r==1) { if(len_hash[$1]) { len = len_hash[$1]; } else { len = -1 } + print $0, len; }}' \ + r=0 $graphdir/phones/align_lexicon.txt \ + r=1 $graphdir/words.txt \ + >$word_length + +# Extract unigrams, +unigrams=$(mktemp); steps/conf/parse_arpa_unigrams.py $graphdir/words.txt $arpa_gz $unigrams + +###### Paste the 'word-specific' features (first 4 columns have fixed position, more feature-columns can be added), +# Format: "word word_id filter length other_features" +word_feats=$(mktemp) +paste $word_filter <(awk '{ print $3 }' $word_length) <(awk '{ print $3 }' $unigrams) > $word_feats + + +###### Train the calibration, +steps/conf/train_calibration.sh --cmd "$decode_cmd" --lmwt 10 \ + $train_data $graphdir $word_feats \ + $chaindir/decode_${train_set}${decode_affix} $train_caldir + +###### Apply the calibration to eval set, +steps/conf/apply_calibration.sh --cmd "$decode_cmd" \ + $dev_data $graphdir $chaindir/decode_${dev_set}${decode_affix} \ + $train_caldir $dev_caldir +# The final confidences are here '$eval_caldir/ctm_calibrated', + +exit 0 + +###### Sclite scoring, +# We will produce NCE which shows the ``quality'' of the confidences. +# Please compare with the default scoring script for your database. + +# Scoring tools, +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +hubdir=`dirname $hubscr` + +# Inputs, +ctm=$eval_caldir/ctm_calibrated +stm=$eval_data/stm +glm=$eval_data/glm + +# Normalizng CTM, just like in 'local/score_sclite.sh', +cat $ctm | grep -i -v -E '\[NOISE|LAUGHTER|VOCALIZED-NOISE\]' | \ + grep -i -v -E '' | \ + grep -i -v -E ' (UH|UM|EH|MM|HM|AH|HUH|HA|ER|OOF|HEE|ACH|EEE|EW) ' | \ + awk '$5 !~ /^.*-$/' | \ + local/map_acronyms_ctm.py -M data/local/dict_nosp/acronyms.map -i - -o ${ctm}.filt + +# Mapping the time info to global, +utils/convert_ctm.pl $eval_data/segments $eval_data/reco2file_and_channel <${ctm}.filt >${ctm}.filt.conv + +# Scoring, +$hubscr -p $hubdir -V -l english -h hub5 -g $glm -r $stm ${ctm}.filt.conv + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_b.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_b.sh new file mode 100644 index 00000000000..6254dd5d184 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_b.sh @@ -0,0 +1,198 @@ +#!/bin/bash +set -e + +# Based on run_tdnn_7b.sh in the fisher swbd recipe + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train +tree_affix= +nnet3_affix= +gmm=tri5a +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 11000 $build_tree_train_data_dir $lang $build_tree_ali_dir $treedir || exit 1; +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
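  # Note: the xconfig above defines two chain output layers, output-0 and
  # output-1, rather than the usual single 'output'; this lines up with the
  # two egs "languages" (supervised and unsupervised) that the semi-supervised
  # recipes later in this series create with the multilingual egs machinery.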
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_a.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_a.sh new file mode 100755 index 00000000000..c5e0401c3e5 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_a.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +set -e -o pipefail + +stage=-2 +nj=30 +decode_nj=30 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k # affix relating train-set splitting proportion + +tdnn_affix=_sup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# combination options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +comb_affix=_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +unsup_egs_weight=1.0 +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +left_tolerance=2 +right_tolerance=2 +train_combined_opts="--num-epochs 4.5" +graph_affix= # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= +# to tune: +# frames_per_eg for unsupervised + +# End configuration section. 
+echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} + +if ! cuda-compiled; then + cat <$n1?$n2:$n1)) + num_archives=$[num_archives*3/2] + mkdir -p $comb_egs_dir/log + cp {$sup_egs_dir,$comb_egs_dir}/train_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/valid_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/combine.cegs + cp {$sup_egs_dir,$comb_egs_dir}/cmvn_opts + cp -r $sup_egs_dir/info $comb_egs_dir + echo $num_archives > $comb_egs_dir/info/num_archives + cat {$sup_egs_dir,$unsup_egs_dir}/info/num_frames | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/num_frames + cat {$sup_egs_dir,$unsup_egs_dir}/info/egs_per_archive | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/egs_per_archive + out_egs_list= + egs_list= + for n in $(seq $num_archives); do + [ -f $sup_egs_dir/cegs.$n.ark ] && egs_list="$egs_list $sup_egs_dir/cegs.$n.ark" + [ -f $unsup_egs_dir/cegs.$n.ark ] && egs_list="$egs_list $unsup_egs_dir/cegs.$n.ark" + out_egs_list="$out_egs_list ark:$comb_egs_dir/cegs.$n.ark" + done + srand=0 + $decode_cmd $comb_egs_dir/log/combine.log \ + nnet3-chain-copy-egs "ark:cat $egs_list|" $out_egs_list +fi + +if [ $stage -le 3 ]; then + echo "$0: training on the supervised+unsupervised subset" + # the train-set and gmm do not matter as we are providing the egs + local/chain/run_tdnn.sh --stage 12 --remove-egs false --train-set $supervised_set \ + --nnet3-affix $nnet3_affix \ + --tdnn-affix ${tdnn_affix}${decode_affix}${egs_affix}${comb_affix} \ + --common-egs-dir $comb_egs_dir $train_combined_opts +fi diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_b.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_b.sh new file mode 100755 index 00000000000..0c12140c8c7 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_b.sh @@ -0,0 +1,253 @@ +#!/bin/bash + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=30 +decode_nj=30 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_ex250k +egs_affix=_prun2_lmwt0_tol2 # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +left_tolerance=2 +right_tolerance=2 +graph_affix= # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +common_egs_dir= + +# Semi-supervised options +comb_affix=_comb1b2 # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 + +decode_iter= +# to tune: +# frames_per_eg for unsupervised + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_sp_hires \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z "$decode_iter" ]; then + iter_opts=" --iter $decode_iter " + else + decode_iter=final + fi + + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output;" $dir/$decode_iter.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/$decode_iter.mdl $dir/${decode_iter}-output.mdl + + iter_opts=" --iter ${decode_iter}-output " + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_a.sh new file mode 100644 index 00000000000..4a0b5f1dd26 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_a.sh @@ -0,0 +1,331 @@ +#!/bin/bash + +# This script is the baseline with unsupervised egs in multilingual recipe. 
+# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: None +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 0 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 + +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! cuda-compiled; then + cat < $conf_dir/weights.scp +fi + +left_context=`cat $chaindir/egs/info/left_context` +right_context=`cat $chaindir/egs/info/right_context` +left_context_initial=`cat $chaindir/egs/info/left_context_initial` +right_context_final=`cat $chaindir/egs/info/right_context_final` + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=`cat $chaindir/egs/info/frames_per_eg` +frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +cmvn_opts=`cat $chaindir/cmvn_opts` + +unsup_egs_dir=$chaindir/unsup_egs${decode_affix}${egs_affix} + +if [ $stage -le 9 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
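  # The weights.scp prepared above is passed to get_egs.sh below as
  # --deriv-weights-scp, so each unsupervised frame carries a per-frame weight
  # inside the egs; with --chain.apply-deriv-weights true at training time,
  # those weights scale the frame-level derivatives of the unsupervised data.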
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $conf_dir/weights.scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + data/${unsupervised_set}_hires $chaindir \ + ${chaindir}/decode_${unsupervised_set}${decode_affix} $unsup_egs_dir +fi + +sup_egs_dir=$chaindir/egs_scp +comb_egs_dir=$chaindir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 10 ]; then + + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --samples-per-iter 10000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +treedir=exp/chain${nnet3_affix}/tree_${tree_affix} +lat_dir=exp/chain${nnet3_affix}/tri5a_${supervised_set}_sp_lats # not required since egs is given. +dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_sp_hires \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_c.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_c.sh new file mode 100644 index 00000000000..0564bf693ab --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_c.sh @@ -0,0 +1,380 @@ +#!/bin/bash + +# This script is similar to _a but uses denominator FST created using +# LM estimated on supervised + unsupervised set phone sequences +# and deriv weights from calibrated confidences. +# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: None +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 2/3 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=_comb1c # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 + +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $conf_dir/weights.scp +fi + +left_context=`cat $chaindir/egs/info/left_context` +right_context=`cat $chaindir/egs/info/right_context` +left_context_initial=`cat $chaindir/egs/info/left_context_initial` +right_context_final=`cat $chaindir/egs/info/right_context_final` + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=`cat $chaindir/egs/info/frames_per_eg` + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +treedir=exp/chain${nnet3_affix}/tree_${tree_affix} +dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix} + +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +if [ $stage -le 9 ]; then + false && $decode_cmd JOB=1:$(cat $unsup_lat_dir/num_jobs) \ + ${chaindir}/best_path_${unsupervised_set}${decode_affix}/log/get_best_path.JOB.log \ + lattice-best-path --acoustic-scale=1.0 \ + "ark:gunzip -c $unsup_lat_dir/lat.JOB.gz |" ark:/dev/null \ + "ark:| gzip -c > ${chaindir}/best_path_${unsupervised_set}${decode_affix}/ali.JOB.gz" + + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +supervised_set=${supervised_set}_sp +sup_lat_dir=exp/chain${nnet3_affix}/tri5a_${supervised_set}_lats +sup_egs_dir=$dir/egs_${supervised_set} + +if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $(cat $chaindir/egs/info/frames_per_eg) \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir +fi + +unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
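+
+  # A note on the subsampling flags: the supervised lattices (from the GMM system
+  # tri5a) are at the original frame rate, so the supervised egs above are dumped
+  # with --alignment-subsampling-factor 3; the unsupervised lattices below were
+  # produced by decoding with the chain model itself, so they are already at the
+  # subsampled output frame rate and get --alignment-subsampling-factor 1
+  # (frame_subsampling_factor is typically 3 for these chain setups).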
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $conf_dir/weights.scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $chaindir \ + $unsup_lat_dir $unsup_egs_dir +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 12 ]; then + + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --samples-per-iter 10000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $stage -le 13 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -2 ]; then + train_stage=-2 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_d.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_d.sh new file mode 100644 index 00000000000..572a3f8466e --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_d.sh @@ -0,0 +1,298 @@ +#!/bin/bash + +# This script is similar to _a but uses deriv weights from lattice-posteriors. +# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posteriors (Bug when originally run) +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 0 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1d # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 + +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
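+  # output-0 and output-1 are per-source copies of the chain output used when the
+  # supervised and unsupervised egs are combined multilingual-style (output-0 gets
+  # the supervised egs, output-1 the unsupervised ones, in the order the egs dirs
+  # are passed to combine_egs.sh); the plain 'output' node is unused in training
+  # and the decode stage later renames output-0 to 'output'.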
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_e.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_e.sh new file mode 100644 index 00000000000..24734d216e2 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_e.sh @@ -0,0 +1,396 @@ +#!/bin/bash + +# This script is similar to _c but re-creates supervised egs using new +# normalization FST. 
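+# (The "new normalization FST" is the normalization.fst written by make_den_fst.sh
+# in stage 9 from the combined supervised+unsupervised phone LM; get_egs.sh composes
+# it into every supervision FST, so the supervised egs are re-dumped against $dir
+# here rather than reused from the baseline supervised-only model.)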
+# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: None +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 2/3 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +sup_egs_dir= +comb_affix=comb1e # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 + +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $conf_dir/weights.scp +fi + +left_context=`cat $chaindir/egs/info/left_context` +right_context=`cat $chaindir/egs/info/right_context` +left_context_initial=`cat $chaindir/egs/info/left_context_initial` +right_context_final=`cat $chaindir/egs/info/right_context_final` + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=`cat $chaindir/egs/info/frames_per_eg` + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +treedir=exp/chain${nnet3_affix}/tree_${tree_affix} +dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +if [ $stage -le 9 ]; then + false && $decode_cmd JOB=1:$(cat $unsup_lat_dir/num_jobs) \ + ${chaindir}/best_path_${unsupervised_set}${decode_affix}/log/get_best_path.JOB.log \ + lattice-best-path --acoustic-scale=1.0 \ + "ark:gunzip -c $unsup_lat_dir/lat.JOB.gz |" ark:/dev/null \ + "ark:| gzip -c > ${chaindir}/best_path_${unsupervised_set}${decode_affix}/ali.JOB.gz" + + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +supervised_set=${supervised_set}_sp +sup_lat_dir=exp/chain${nnet3_affix}/tri5a_${supervised_set}_lats + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + + left_context=`cat $chaindir/egs/info/left_context` + right_context=`cat $chaindir/egs/info/right_context` + left_context_initial=`cat $chaindir/egs/info/left_context_initial` + right_context_final=`cat $chaindir/egs/info/right_context_final` + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + left_context=`cat $sup_egs_dir/info/left_context` + right_context=`cat $sup_egs_dir/info/right_context` + left_context_initial=`cat $sup_egs_dir/info/left_context_initial` + right_context_final=`cat $sup_egs_dir/info/right_context_final` + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg +unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $conf_dir/weights.scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $chaindir \ + $unsup_lat_dir $unsup_egs_dir +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 12 ]; then + + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --samples-per-iter 10000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $stage -le 13 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_f.sh new file mode 100644 index 00000000000..faef0c70546 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_f.sh @@ -0,0 +1,347 @@ +#!/bin/bash + +# This script is similar to _e but uses deriv weights from lattice-posteriors +# instead of from calibrated confidences. +# But there is a minor bug in creating the lattice posteriors when this +# script was run. An acwt of 1.0 was used for lattice-best-path when it +# should have been 0.1. +# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posteriors (Bug when originally run) +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 2/3 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +sup_egs_dir= +comb_affix=comb1f # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 + +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
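+  # (The chain outputs above are built with include-log-softmax=false because the
+  # chain objective consumes unnormalized log-likelihoods; the xent outputs below
+  # keep the default log-softmax so they can be trained with cross-entropy.)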
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_g.sh new file mode 100644 index 00000000000..9dbca030174 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_g.sh @@ -0,0 +1,383 @@ +#!/bin/bash + +# This script is same as _e but uses a weight of 1.0 for unsupervised egs. +# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: None +# Unsupervised weight: 1.0 +# Unsupervised weight for phone LM: 2/3 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1g # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $conf_dir/weights.scp +fi + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +treedir=exp/chain${nnet3_affix}/tree_${tree_affix} +dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +if [ $stage -le 9 ]; then + $decode_cmd JOB=1:$(cat $unsup_lat_dir/num_jobs) \ + ${chaindir}/best_path_${unsupervised_set}${decode_affix}/log/get_best_path.JOB.log \ + lattice-best-path --acoustic-scale=1.0 \ + "ark:gunzip -c $unsup_lat_dir/lat.JOB.gz |" ark:/dev/null \ + "ark:| gzip -c > ${chaindir}/best_path_${unsupervised_set}${decode_affix}/ali.JOB.gz" + + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +supervised_set=${supervised_set}_sp +sup_lat_dir=exp/chain${nnet3_affix}/tri5a_${supervised_set}_lats + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + left_context=`cat $sup_egs_dir/info/left_context` + right_context=`cat $sup_egs_dir/info/right_context` + left_context_initial=`cat $sup_egs_dir/info/left_context_initial` + right_context_final=`cat $sup_egs_dir/info/right_context_final` + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
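+
+    # --generate-egs-scp true below makes get_egs.sh also write egs.scp indexes;
+    # the multilingual combine_egs.sh call in the next stage reads both egs dirs
+    # through those scp files when mixing the supervised and unsupervised examples.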
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $conf_dir/weights.scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $chaindir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 12 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --samples-per-iter 10000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $stage -le 13 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
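+  # (With this splicing the network sees about 17 frames of left context and 12 of
+  # right context in total: 1+1+3+3+3+6 on the left and 1+2+3+3+3 on the right,
+  # from the lda Append and tdnn2-tdnn6.)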
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_h.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_h.sh new file mode 100644 index 00000000000..866f310c0ed --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_h.sh @@ -0,0 +1,348 @@ +#!/bin/bash + +# This script is same as _g, but uses deriv weights from lattice posteriors +# instead of calibrated confidences. But there was a bug when running this +# script. (An acwt of 1.0 was used for lattice-best-path instead of 0.1) +# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posteriors (Bug when originally run) +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 2/3 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1h # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
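+  # A note on the three chain output layers above: after the multilingual-style
+  # egs combination, the supervised egs train 'output-0' and the unsupervised
+  # egs train 'output-1' (presumably in the order the egs dirs are given to
+  # combine_egs.sh), while decoding expects a node named 'output' (the decode
+  # stage below renames output-0 to output).  The perl edit applied to
+  # final.config below ties output-1's affine component to output-0's, so both
+  # heads share one set of output parameters.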
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_i.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_i.sh new file mode 100644 index 00000000000..69e29d600c9 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_i.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is similar to _h, but uses unsup_frames_per_eg of 300. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=300 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1i # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
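+  # Because this variant mixes 150-frame supervised egs with 300-frame
+  # unsupervised egs (unsup_frames_per_eg=300 above), the train.py call below
+  # uses the variable minibatch spec "150=128/300=64": 128 chunks per minibatch
+  # for 150-frame egs and 64 for 300-frame egs, which keeps the frames per
+  # minibatch roughly constant (150*128 = 300*64 = 19200).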
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_j.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_j.sh new file mode 100644 index 00000000000..6d98f9cf6da --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_j.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is same as _k, but uses a weight of 0.5 for unsupervised egs. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=300 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1j # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
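+  # Note: this variant sets supervision_weights=1.0,0.5 above, so via
+  # --lang2weight in combine_egs.sh the unsupervised egs presumably contribute
+  # to the training objective (and gradients) with half the weight of the
+  # supervised egs.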
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_k.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_k.sh new file mode 100644 index 00000000000..96d101ac2f2 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_k.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is same as _f, but uses an lm-scale of 0.1. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.1 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1k # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_l.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_l.sh new file mode 100644 index 00000000000..371bfcfc1b6 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_l.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is same as _f, but uses an lm-scale of 0.5. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1l # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_m.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_m.sh new file mode 100644 index 00000000000..b608e77e309 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_m.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1m # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_n.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_n.sh new file mode 100644 index 00000000000..b463ed56485 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_n.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is same as _c, but redone to be consistent with _m. +# So it does not have any deriv weights. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1n # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_o.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_o.sh new file mode 100644 index 00000000000..b4e9e1e5faf --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_o.sh @@ -0,0 +1,341 @@ +#!/bin/bash + +# This script is same as _a, but re-done to be consistent with _m. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1o # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=1 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_p.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_p.sh new file mode 100644 index 00000000000..7137523c843 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_p.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb270k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train +semi_affix=270k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a_20k # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1p # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_q.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_q.sh new file mode 100644 index 00000000000..cf12901f617 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_q.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb270k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup_20k +semi_affix=270k_conf_pca # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a_20k # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1q # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/nnet3/run_ivector_common_pca.sh b/egs/fisher_english/s5/local/nnet3/run_ivector_common_pca.sh new file mode 100755 index 00000000000..e98fd479244 --- /dev/null +++ b/egs/fisher_english/s5/local/nnet3/run_ivector_common_pca.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +. 
./cmd.sh +set -e +stage=1 +speed_perturb=true +train_set=train +ivector_train_set=train + +nnet3_affix= + +. ./path.sh +. ./utils/parse_options.sh + +# perturbed data preparation +if [ "$speed_perturb" == "true" ]; then + if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have + # to perturb the normal data to get the alignments. + # _sp stands for speed-perturbed + + for datadir in ${train_set} ${ivector_train_set}; do + utils/data/perturb_data_dir_speed_3way.sh data/${datadir} data/${datadir}_sp + utils/fix_data_dir.sh data/${datadir}_sp + + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${datadir}_sp exp/make_mfcc/${datadir}_sp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_sp exp/make_mfcc/${datadir}_sp $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_sp + done + fi + train_set=${train_set}_sp +fi + +if [ $stage -le 3 ]; then + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english-$date/s5b/$mfccdir/storage $mfccdir/storage + fi + + for dataset in $ivector_train_set $train_set; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + utils/data/perturb_data_dir_volume.sh data/${dataset}_hires + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir; + + # Remove the small number of utterances that couldn't be extracted for some + # reason (e.g. too short; no such file). + utils/fix_data_dir.sh data/${dataset}_hires; + done + + for dataset in test dev; do + # Create MFCCs for the eval set + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ + data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems + done +fi + +# ivector extractor training +if [ $stage -le 4 ]; then + steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + --max-utts 10000 --subsample 2 \ + data/${ivector_train_set}_hires \ + $exp/nnet3${nnet3_affix}/pca_transform +fi + +if [ $stage -le 5 ]; then + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${ivector_train_set}_hires 512 \ + $exp/nnet3${nnet3_affix}/pca_transform $exp/nnet3${nnet3_affix}/diag_ubm +fi + +if [ $stage -le 6 ]; then + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${ivector_train_set}_hires $exp/nnet3${nnet3_affix}/diag_ubm $exp/nnet3${nnet3_affix}/extractor || exit 1; +fi + +if [ $stage -le 7 ]; then + # We extract iVectors on all the ${train_set} data, which will be what we + # train the system on. + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). 
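+  # (The --utts-per-spk-max 2 copy below splits each speaker into
+  #  pseudo-speakers of at most 2 utterances, so the i-vector extractor sees
+  #  per-speaker statistics closer to what is available when decoding
+  #  per utterance.)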
+ steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${ivector_train_set}_hires data/${ivector_train_set}_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${ivector_train_set}_max2_hires $exp/nnet3${nnet3_affix}/extractor $exp/nnet3${nnet3_affix}/ivectors_${ivector_train_set}_hires || exit 1; + + for dataset in test dev; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${dataset}_hires $exp/nnet3${nnet3_affix}/extractor $exp/nnet3${nnet3_affix}/ivectors_${dataset}_hires || exit 1; + done +fi + +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k.sh new file mode 100755 index 00000000000..ecbddef1b28 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k.sh @@ -0,0 +1,193 @@ +#!/bin/bash +set -e + +# Based on run_tdnn_7b.sh in the fisher swbd recipe + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup11k +ivector_train_set=semisup11k_250k +tree_affix= +nnet3_affix=_semi11k_250k +chain_affix=_semi11k_250k +exp=exp/semisup_11k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh new file mode 100644 index 00000000000..7e81a4a985b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh @@ -0,0 +1,353 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. 
+ +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/run_10k.sh b/egs/fisher_english/s5/local/semisup/run_10k.sh new file mode 100644 index 00000000000..a5a293f3ce2 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/run_10k.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +. cmd.sh +. path.sh + +stage=-1 +train_stage=-10 + +. utils/parse_options.sh + +set -o pipefail +exp=exp/semisup_11k +false && { +utils/subset_data_dir.sh --speakers data/train_sup 11000 data/train_sup11k || exit 1 +utils/subset_data_dir.sh --shortest data/train_sup11k 5000 data/train_sup11k_short || exit 1 +utils/subset_data_dir.sh data/train_sup11k 5500 data/train_sup11k_half || exit 1 + +steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup11k_short data/lang $exp/mono0a || exit 1 + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup11k_half data/lang $exp/mono0a $exp/mono0a_ali || exit 1 + +steps/train_deltas.sh --cmd "$train_cmd" \ + 2000 10000 data/train_sup11k_half data/lang $exp/mono0a_ali $exp/tri1 || exit 1 + +(utils/mkgraph.sh data/lang_test $exp/tri1 $exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri1/graph data/dev $exp/tri1/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup11k data/lang $exp/tri1 $exp/tri1_ali || exit 1; + +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup11k data/lang $exp/tri1_ali $exp/tri2 || exit 1; + +(utils/mkgraph.sh data/lang_test $exp/tri2 $exp/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri2/graph data/dev $exp/tri2/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup11k data/lang $exp/tri2 $exp/tri2_ali || exit 1; + +steps/train_sat.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup11k data/lang $exp/tri2_ali $exp/tri3 || exit 1; + +( + utils/mkgraph.sh data/lang_test $exp/tri3 $exp/tri3/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri3/graph data/dev $exp/tri3/decode_dev +)& + +utils/combine_data.sh data/semisup11k_250k data/train_sup11k data/train_unsup250k || exit 1 +} + +local/semisup/chain/tuning/run_tdnn_11k.sh \ + --ivector-train-set semisup11k_250k --train-set train_sup11k --stage $stage --train-stage $train_stage || exit 1 diff --git a/egs/fisher_english/s5/local/semisup/run_15k.sh b/egs/fisher_english/s5/local/semisup/run_15k.sh new file mode 100644 index 00000000000..2be45d954d6 --- /dev/null +++ 
b/egs/fisher_english/s5/local/semisup/run_15k.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +. cmd.sh +. path.sh + +stage=-1 +train_stage=-10 + +. utils/parse_options.sh + +set -o pipefail +exp=exp/semisup_15k + +false && { +utils/subset_data_dir.sh --speakers data/train_sup 15000 data/train_sup15k || exit 1 +utils/subset_data_dir.sh --shortest data/train_sup15k 5000 data/train_sup15k_short || exit 1 +utils/subset_data_dir.sh data/train_sup15k 7500 data/train_sup15k_half || exit 1 + +steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup15k_short data/lang $exp/mono0a || exit 1 + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup15k_half data/lang $exp/mono0a $exp/mono0a_ali || exit 1 + +steps/train_deltas.sh --cmd "$train_cmd" \ + 2000 10000 data/train_sup15k_half data/lang $exp/mono0a_ali $exp/tri1 || exit 1 + +(utils/mkgraph.sh data/lang_test $exp/tri1 $exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri1/graph data/dev $exp/tri1/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup15k data/lang $exp/tri1 $exp/tri1_ali || exit 1; + +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup15k data/lang $exp/tri1_ali $exp/tri2 || exit 1; + +(utils/mkgraph.sh data/lang_test $exp/tri2 $exp/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri2/graph data/dev $exp/tri2/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup15k data/lang $exp/tri2 $exp/tri2_ali || exit 1; + +steps/train_sat.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup15k data/lang $exp/tri2_ali $exp/tri3 || exit 1; + +( + utils/mkgraph.sh data/lang_test $exp/tri3 $exp/tri3/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri3/graph data/dev $exp/tri3/decode_dev +)& + +utils/combine_data.sh data/semisup15k_250k data/train_sup15k data/train_unsup250k || exit 1 +} + +local/semisup/chain/tuning/run_tdnn_11k.sh \ + --train-set train_sup15k \ + --nnet3-affix _semi15k_250k \ + --chain-affix _semi15k_250k \ + --stage $stage --train-stage $train_stage \ + --exp $exp \ + --ivector-train-set semisup15k_250k || exit 1 diff --git a/egs/wsj/s5/steps/best_path_weights.sh b/egs/wsj/s5/steps/best_path_weights.sh new file mode 100755 index 00000000000..c2e0c60f961 --- /dev/null +++ b/egs/wsj/s5/steps/best_path_weights.sh @@ -0,0 +1,160 @@ +#!/bin/bash + +# Copyright 2014-17 Vimal Manohar + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# This script combines frame-level posteriors from different decode +# directories. The first decode directory is assumed to be the primary +# and is used to get the best path. The posteriors from other decode +# directories are interpolated with the posteriors of the best path. 
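+# Concretely (illustrative numbers; per-system weights come from the optional
+# ":weight" suffix on each decode-dir): given exp/tri6_nnet/decode:0.5 and
+# exp/sgmm_mmi_b0.1/decode:0.25, a frame whose best-path pdf has posterior 0.9
+# under the first system and 0.6 under the second gets the weight
+# (0.5 * 0.9 + 0.25 * 0.6) / (0.5 + 0.25) = 0.8.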
+# The output is a new directory with final.mdl, tree from the primary +# decode-dir and the best path alignments and weights in a decode-directory +# with the same basename as the primary directory. +# This is typically used to get better posteriors for semisupervised training +# of DNN +# e.g. local/combine_posteriors.sh exp/tri6_nnet/decode_train_unt.seg +# exp/sgmm_mmi_b0.1/decode_fmllr_train_unt.seg_it4 exp/combine_dnn_sgmm +# Here the final.mdl and tree are copied from exp/tri6_nnet to +# exp/combine_dnn_sgmm. ali.*.gz obtained from the primary dir and +# the interpolated posteriors in weights.scp are placed in +# exp/combine_dnn_sgmm/decode_train_unt.seg + +set -e + +# begin configuration section. +cmd=run.pl +stage=-10 +acwt=0.1 +#end configuration section. + +help_message="Usage: "$(basename $0)" [options] [:weight] [:weight] [[:weight] ... ] + E.g. "$(basename $0)" data/train_unt.seg data/lang exp/tri1/decode:0.5 exp/tri2/decode:0.25 exp/tri3/decode:0.25 exp/combine +Options: + --cmd (run.pl|queue.pl...) # specify how to run the sub-processes. +"; + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -lt 4 ]; then + printf "$help_message\n"; + exit 1; +fi + +data=$1 +lang=$2 +dir=${@: -1} # last argument to the script +shift 2; +decode_dirs=( $@ ) # read the remaining arguments into an array +unset decode_dirs[${#decode_dirs[@]}-1] # 'pop' the last argument which is odir +num_sys=${#decode_dirs[@]} # number of systems to combine + +mkdir -p $dir +mkdir -p $dir/log + +decode_dir=`echo ${decode_dirs[0]} | cut -d: -f1` +nj=`cat $decode_dir/num_jobs` + +mkdir -p $dir + +if [ $stage -lt -1 ]; then + mkdir -p $dir/log + $cmd JOB=1:$nj $dir/log/best_path.JOB.log \ + lattice-best-path --acoustic-scale=$acwt \ + "ark,s,cs:gunzip -c $decode_dir/lat.JOB.gz |" \ + ark:/dev/null "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 +fi + +src_dir=`dirname $decode_dir` + +cp $src_dir/cmvn_opts $dir/ || exit 1 +for f in final.mat splice_opts frame_subsampling_factor; do + [ -f $src_dir/$f ] && cp $src_dir/$f $dir +done + +weights_sum=0.0 + +for i in `seq 0 $[num_sys-1]`; do + decode_dir=${decode_dirs[$i]} + + weight=`echo $decode_dir | cut -d: -s -f2` + [ -z "$weight" ] && weight=1.0 + + if [ $i -eq 0 ]; then + file_list="\"ark:vector-scale --scale=$weight ark:$dir/weights.$i.JOB.ark ark:- |\"" + else + file_list="$file_list \"ark,s,cs:vector-scale --scale=$weight ark:$dir/weights.$i.JOB.ark ark:- |\"" + fi + + weights_sum=`perl -e "print STDOUT $weights_sum + $weight"` +done + +inv_weights_sum=`perl -e "print STDOUT 1.0/$weights_sum"` + +fdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}` + +for i in `seq 0 $[num_sys-1]`; do + if [ $stage -lt $i ]; then + decode_dir=`echo ${decode_dirs[$i]} | cut -d: -f1` + + model=`dirname $decode_dir`/final.mdl # model one level up from decode dir + tree=`dirname $decode_dir`/tree # tree one level up from decode dir + + for f in $model $decode_dir/lat.1.gz $tree; do + [ ! 
-f $f ] && echo "$0: expecting file $f to exist" && exit 1; + done + if [ $i -eq 0 ]; then + nj=`cat $decode_dir/num_jobs` || exit 1; + cp $model $dir || exit 1 + cp $tree $dir || exit 1 + echo $nj > $dir/num_jobs + else + if [ $nj != `cat $decode_dir/num_jobs` ]; then + echo "$0: number of decoding jobs mismatches, $nj versus `cat $decode_dir/num_jobs`" + exit 1; + fi + fi + + $cmd JOB=1:$nj $dir/log/get_post.$i.JOB.log \ + lattice-to-post --acoustic-scale=$acwt \ + "ark,s,cs:gunzip -c $decode_dir/lat.JOB.gz|" ark:- \| \ + post-to-pdf-post $model ark,s,cs:- ark:- \| \ + get-post-on-ali ark,s,cs:- "ark,s,cs:gunzip -c $dir/ali.JOB.gz | convert-ali $dir/final.mdl $model $tree ark,s,cs:- ark:- | ali-to-pdf $model ark,s,cs:- ark:- |" "ark,scp:$fdir/weights.$i.JOB.ark,$fdir/weights.$i.JOB.scp" || exit 1 + fi +done + +if [ $stage -lt $num_sys ]; then + if [ "$num_sys" -eq 1 ]; then + for n in `seq $nj`; do + cat $dir/weights.0.$n.scp + done > $dir/weights.scp + else + $cmd JOB=1:$nj $dir/log/interpolate_post.JOB.log \ + vector-sum $file_list ark:- \| \ + vector-scale --scale=$inv_weights_sum ark:- \ + ark,scp:$fdir/weights.JOB.ark,$fdir/weights.JOB.scp || exit 1 + + for n in `seq $nj`; do + cat $dir/weights.$n.scp + done > $dir/weights.scp + fi +fi + +for n in `seq 1 $[num_sys-1]`; do + rm $dir/weights.$n.*.ark $dir/weights.$n.*.scp +done + +exit 0 diff --git a/egs/wsj/s5/steps/conf/apply_calibration.sh b/egs/wsj/s5/steps/conf/apply_calibration.sh index c1a22e274b8..48f9e17d30b 100755 --- a/egs/wsj/s5/steps/conf/apply_calibration.sh +++ b/egs/wsj/s5/steps/conf/apply_calibration.sh @@ -28,6 +28,7 @@ caldir=$4 dir=$5 model=$latdir/../final.mdl # assume model one level up from decoding dir. +model_dir=$latdir/.. calibration=$caldir/calibration.mdl word_feats=$caldir/word_feats word_categories=$caldir/word_categories @@ -49,6 +50,12 @@ cp $calibration $dir/calibration.mdl cp $word_feats $dir/word_feats cp $word_categories $dir/word_categories +frame_shift_opt= +if [ -f $model_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=$(cat $model_dir/frame_subsampling_factor) + frame_shift_opt="--frame-shift=0.0$frame_subsampling_factor" +fi + # Create the ctm with raw confidences, # - we keep the timing relative to the utterance, if [ $stage -le 0 ]; then @@ -58,7 +65,7 @@ if [ $stage -le 0 ]; then lattice-push --push-strings=false ark:- ark:- \| \ lattice-align-words-lexicon --max-expand=10.0 \ $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - lattice-to-ctm-conf --decode-mbr=$decode_mbr ark:- - \| \ + lattice-to-ctm-conf --decode-mbr=$decode_mbr $frame_shift_opt ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \ '>' $dir/JOB.ctm # Merge and clean, @@ -76,7 +83,7 @@ fi # Create the forwarding data for logistic regression, if [ $stage -le 2 ]; then steps/conf/prepare_calibration_data.py --conf-feats $dir/forward_feats.ark \ - --lattice-depth $latdepth $dir/ctm_int $word_feats $word_categories + --lattice-depth $latdepth $frame_shift_opt $dir/ctm_int $word_feats $word_categories fi # Apply calibration model to dev, diff --git a/egs/wsj/s5/steps/conf/convert_ctm_to_weights.py b/egs/wsj/s5/steps/conf/convert_ctm_to_weights.py new file mode 100755 index 00000000000..02a616b2c03 --- /dev/null +++ b/egs/wsj/s5/steps/conf/convert_ctm_to_weights.py @@ -0,0 +1,101 @@ +#! 
/usr/bin/env python + +import argparse +import logging +import sys + +sys.path.insert(0, 'steps') +import libs.common as common_lib + +logger = logging.getLogger('libs') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script converts CTM to per-frame weights by the word + posteriors in the CTM as the weights.""") + + parser.add_argument("--frame-shift", type=float, default=0.01, + help="Frame shift value in seconds") + parser.add_argument("--default-weight", type=float, default=1.0, + help="Default weight on silence frames") + parser.add_argument("segments_in", type=str, help="Input segments file") + parser.add_argument("ctm_in", type=str, help="Input utterance-level CTM " + "file i.e. the first column has utterance-ids") + parser.add_argument("weights_out", type=str, help="Output per-frame " + "weights vector written in Kaldi text archive format") + + args = parser.parse_args() + + return args + + +def run(args): + utt2num_frames = {} + with common_lib.smart_open(args.segments_in) as segments_reader: + for line in segments_reader.readlines(): + parts = line.strip().split() + if len(parts) not in [4, 5]: + raise RuntimeError("Invalid line {0} in segments file {1}" + "".format(line.strip(), args.segments_in)) + utt2num_frames[parts[0]] = int((float(parts[3]) - float(parts[2])) + / args.frame_shift + 0.5) + + num_utt = 0 + with common_lib.smart_open(args.ctm_in) as ctm_reader, \ + common_lib.smart_open(args.weights_out, 'w') as weights_writer: + prev_utt = None + weights = [] + for line in ctm_reader.readlines(): + parts = line.strip().split() + if len(parts) not in [5, 6]: + raise RuntimeError("Invalid line {0} in CTM file {1}" + "".format(line.strip(), args.ctm_in)) + + utt = parts[0] + if utt != prev_utt: + if prev_utt is not None: + assert len(weights) >= utt2num_frames[prev_utt] + common_lib.write_vector_ascii(weights_writer, weights, + key=prev_utt) + weights = [args.default_weight for x in + range(utt2num_frames[utt])] + + start_time = float(parts[2]) + dur = float(parts[3]) + prob = 1.0 if len(parts) == 5 else float(parts[5]) + + start_frame = int(start_time / args.frame_shift + 0.5) + length = int(dur / args.frame_shift) + + if len(weights) < start_frame + length: + weights.extend([args.default_weight for x in + range(len(weights), start_frame + length)]) + for x in range(start_frame, start_frame + length): + weights[x] = prob + + assert len(weights) >= start_frame + length + prev_utt = utt + num_utt += 1 + assert len(weights) >= utt2num_frames[prev_utt] + common_lib.write_vector_ascii(weights_writer, weights, + key=prev_utt) + + if num_utt == 0: + raise RuntimeError("Failed to process any utterances") + + +def main(): + args = get_args() + run(args) + + +if __name__ == "__main__": + main() diff --git a/egs/wsj/s5/steps/conf/prepare_calibration_data.py b/egs/wsj/s5/steps/conf/prepare_calibration_data.py index bc8f92a2f7f..753771b1d89 100755 --- a/egs/wsj/s5/steps/conf/prepare_calibration_data.py +++ b/egs/wsj/s5/steps/conf/prepare_calibration_data.py @@ -10,7 +10,7 @@ Prepare input features and training targets for logistic regression, which calibrates the Minimum Bayes Risk posterior confidences. 
-The logisitc-regression input features are: +The logisitc-regression input features are: - posteriors from 'ctm' transformed by logit, - logarithm of word-length in letters, - 10base logarithm of unigram probability of a word from language model, @@ -34,6 +34,8 @@ parser.add_option("--conf-targets", help="Targets file for logistic regression (no targets generated if '') [default %default]", default='') parser.add_option("--conf-feats", help="Feature file for logistic regression. [default %default]", default='') parser.add_option("--lattice-depth", help="Per-frame lattice depths, ascii-ark (optional). [default %default]", default='') +parser.add_option("--frame-shift", type=float, default=0.01, + help="Frame shift value in seconds [default %default]") (o, args) = parser.parse_args() if len(args) != 3: @@ -63,11 +65,11 @@ if o.conf_targets != '': with open(o.conf_targets,'w') as f: for (utt, chan, beg, dur, wrd_id, conf, score_tag) in ctm: - # Skip the words we don't know if being correct, - if score_tag == 'U': continue + # Skip the words we don't know if being correct, + if score_tag == 'U': continue # Some words are excluded from training (partial words, hesitations, etc.), # (Value: 1 == keep word, 0 == exclude word from the targets), - if not word_filter[wrd_id]: continue + if not word_filter[wrd_id]: continue # Build the key, key = "%s^%s^%s^%s^%s,%s,%s" % (utt, chan, beg, dur, wrd_id, conf, score_tag) # Build the target, @@ -102,7 +104,7 @@ # - log of word-length, log_word_length = math.log(word_length[wrd_id]) # i.e. number of phones in a word, # - categorical distribution of words (with frequency higher than min-count), - wrd_1_of_k = [0]*wrd_cat_num; + wrd_1_of_k = [0]*wrd_cat_num; wrd_1_of_k[wrd_to_cat[wrd_id]] = 1; # Compose the input feature vector, @@ -110,10 +112,10 @@ # Optionally add average-depth of lattice at the word position, if o.lattice_depth != '': - depth_slice = depths[utt][int(round(100.0*float(beg))):int(round(100.0*(float(beg)+float(dur))))] + depth_slice = depths[utt][int(float(beg) / o.frame_shift + 0.5):int((float(beg) + max(o.frame_shift, float(dur))) / o.frame_shift + 0.5)] log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice)) feats += [ log_avg_depth ] - # Store the input features, + # Store the input features, f.write(key + ' [ ' + ' '.join(map(str,feats)) + ' ]\n') diff --git a/egs/wsj/s5/steps/conf/train_calibration.sh b/egs/wsj/s5/steps/conf/train_calibration.sh index c2aca05056e..9a8451c9f85 100755 --- a/egs/wsj/s5/steps/conf/train_calibration.sh +++ b/egs/wsj/s5/steps/conf/train_calibration.sh @@ -12,7 +12,7 @@ # (- categorical distribution of 'lang/words.txt', DISABLED) # begin configuration section. -cmd= +cmd=run.pl lmwt=12 decode_mbr=true word_min_count=10 # Minimum word-count for single-word category, @@ -43,6 +43,7 @@ latdir=$4 dir=$5 model=$latdir/../final.mdl # assume model one level up from decoding dir. +model_dir=$latdir/.. for f in $data/text $lang/words.txt $word_feats $latdir/lat.1.gz; do [ ! 
-f $f ] && echo "$0: Missing file $f" && exit 1 @@ -57,6 +58,12 @@ echo $lmwt >$dir/lmwt echo $decode_mbr >$dir/decode_mbr cp $word_feats $dir/word_feats +frame_shift_opt= +if [ -f $model_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=$(cat $model_dir/frame_subsampling_factor) + frame_shift_opt="--frame-shift=0.0$frame_subsampling_factor" +fi + # Create the ctm with raw confidences, # - we keep the timing relative to the utterance, if [ $stage -le 0 ]; then @@ -66,7 +73,7 @@ if [ $stage -le 0 ]; then lattice-push --push-strings=false ark:- ark:- \| \ lattice-align-words-lexicon --max-expand=10.0 \ $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - lattice-to-ctm-conf --decode-mbr=$decode_mbr ark:- - \| \ + lattice-to-ctm-conf --decode-mbr=$decode_mbr $frame_shift_opt ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \ '>' $dir/JOB.ctm # Merge and clean, @@ -104,7 +111,7 @@ fi if [ $stage -le 3 ]; then steps/conf/prepare_calibration_data.py \ --conf-targets $dir/train_targets.ark --conf-feats $dir/train_feats.ark \ - --lattice-depth $latdepth $dir/ctm_aligned_int $word_feats $dir/word_categories + --lattice-depth $latdepth $frame_shift_opt $dir/ctm_aligned_int $word_feats $dir/word_categories fi # Train the logistic regression, diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index c76953681f4..1de40e91c61 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -9,11 +9,14 @@ commonly used in many kaldi python scripts. """ +from __future__ import print_function import argparse import logging import math import os +import re import subprocess +import sys import threading logger = logging.getLogger(__name__) @@ -66,6 +69,35 @@ def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, values) +class smart_open(object): + """ + This class is designed to be used with the "with" construct in python + to open files. It is similar to the python open() function, but + treats the input "-" specially to return either sys.stdout or sys.stdin + depending on whether the mode is "w" or "r". 
+ + e.g.: with smart_open(filename, 'w') as fh: + print ("foo", file=fh) + """ + def __init__(self, filename, mode="r"): + self.filename = filename + self.mode = mode + assert self.mode == "w" or self.mode == "r" + + def __enter__(self): + if self.filename == "-" and self.mode == "w": + self.file_handle = sys.stdout + elif self.filename == "-" and self.mode == "r": + self.file_handle = sys.stdin + else: + self.file_handle = open(self.filename, self.mode) + return self.file_handle + + def __exit__(self, *args): + if self.filename != "-": + self.file_handle.close() + + def check_if_cuda_compiled(): p = subprocess.Popen("cuda-compiled") p.communicate() @@ -202,9 +234,10 @@ def get_number_of_leaves_from_model(dir): def get_number_of_jobs(alidir): try: num_jobs = int(open('{0}/num_jobs'.format(alidir)).readline().strip()) - except (IOError, ValueError) as e: - raise Exception("Exception while reading the " - "number of alignment jobs: {0}".format(e.errstr)) + except IOError, ValueError: + logger.error("Exception while reading the " + "number of alignment jobs: ", exc_info=True) + raise SystemExit(1) return num_jobs @@ -284,6 +317,130 @@ def write_kaldi_matrix(output_file, matrix): f.write(" ]") +def write_matrix_ascii(file_or_fd, mat, key=None): + try: + fd = open(file_or_fd, 'w') + except TypeError: + # 'file_or_fd' is opened file descriptor, + fd = file_or_fd + + try: + if key is not None: + print ("{0} [".format(key), + file=fd) # ark-files have keys (utterance-id) + else: + print (" [", file=fd) + + num_cols = 0 + for i, row in enumerate(mat): + line = ' '.join(["{0:f}".format(x) for x in row]) + if i == 0: + num_cols = len(row) + elif len(row) != num_cols: + raise Exception("All the rows of a matrix are expected to " + "have the same length") + + if i == len(mat) - 1: + line += " ]" + print (line, file=fd) + finally: + if fd is not file_or_fd : fd.close() + + +def write_vector_ascii(file_or_fd, vec, key=None): + try: + fd = open(file_or_fd, 'w') + except TypeError: + # 'file_or_fd' is opened file descriptor, + fd = file_or_fd + + try: + if key is not None: + print ("{0} [".format(key), + file=fd, end=' ') # ark-files have keys (utterance-id) + else: + print (" [", file=fd, end=' ') + + line = ' '.join(["{0:f}".format(x) for x in vec]) + line += " ]" + print (line, file=fd) + finally: + if fd is not file_or_fd : fd.close() + + +def read_matrix_ascii(file_or_fd): + try: + fd = open(file_or_fd, 'r') + fname = file_or_fd + except TypeError: + # 'file_or_fd' is opened file descriptor, + fd = file_or_fd + fname = file_or_fd.name + + first = fd.read(2) + if first != ' [': + logger.error( + "Kaldi matrix file %s has incorrect format, " + "only text format matrix files can be read by this script", + fname) + raise RuntimeError + + rows = [] + while True: + line = fd.readline() + if len(line) == 0: + logger.error("Kaldi matrix file %s has incorrect format; " + "got EOF before end of matrix", fname) + if len(line.strip()) == 0 : continue # skip empty line + arr = line.strip().split() + if arr[-1] != ']': + rows.append([float(x) for x in arr]) # not last line + else: + rows.append([float(x) for x in arr[:-1]]) # lastline + return rows + if fd is not file_or_fd: + fd.close() + + +def read_key(fd): + """ [str] = read_key(fd) + Read the utterance-key from the opened ark/stream descriptor 'fd'. 
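+    Illustrative example (assuming 'weights.ark' is a text-format archive,
+    e.g. one written by write_matrix_ascii above):
+      >>> fd = open('weights.ark')
+      >>> read_key(fd)   # e.g. 'utt-0001'; returns None at end of file
+    read_mat_ark below uses this to iterate over the (key, matrix) pairs of
+    an archive.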
+ """ + str_ = '' + while True: + char = fd.read(1) + if char == '': + break + if char == ' ': + break + str_ += char + str_ = str_.strip() + if str_ == '': + return None # end of file, + assert (re.match('^[\.a-zA-Z0-9_-]+$', str_) is not None) # check format, + return str_ + + +def read_mat_ark(file_or_fd): + try: + fd = open(file_or_fd, 'r') + fname = file_or_fd + except TypeError: + # 'file_or_fd' is opened file descriptor, + fd = file_or_fd + fname = file_or_fd.name + + try: + key = read_key(fd) + while key: + mat = read_matrix_ascii(fd) + yield key, mat + key = read_key(fd) + finally: + if fd is not file_or_fd: + fd.close() + + def force_symlink(file1, file2): import errno try: diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 7c9706c9ad4..5d89c1c4ba8 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -130,7 +130,8 @@ def train_new_models(dir, iter, srand, num_jobs, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch_str, - frame_subsampling_factor, run_opts): + frame_subsampling_factor, run_opts, + use_multitask_egs=False): """ Called from train_one_iteration(), this method trains new models with 'num_jobs' jobs, and @@ -141,6 +142,12 @@ def train_new_models(dir, iter, srand, num_jobs, to use for each job is a little complex, so we spawn each one separately. this is no longer true for RNNs as we use do not use the --frame option but we use the same script for consistency with FF-DNN code + + use_multitask_egs : True, if different examples used to train multiple + tasks or outputs, e.g.multilingual training. + multilingual egs can be generated using get_egs.sh and + steps/nnet3/multilingual/allocate_multilingual_examples.py, + those are the top-level scripts. 
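A recurring pattern in this change is choosing between .ark archives and .scp indexes based on use_multitask_egs, and splicing the extra options returned by get_multitask_egs_opts into nnet3-chain-copy-egs. A minimal sketch of how train_new_models() assembles the egs rspecifier (the shuffle and merge stages, and the helper's exact options, are omitted here):

def cegs_rspecifier(egs_dir, archive_index, use_multitask_egs,
                    multitask_egs_opts=""):
    # Multitask (e.g. multilingual) egs are addressed via per-archive .scp
    # indexes; the default single-task setup reads the .ark archives directly.
    scp_or_ark = "scp" if use_multitask_egs else "ark"
    return ("ark,bg:nnet3-chain-copy-egs {opts} "
            "{fmt}:{dir}/cegs.{idx}.{fmt} ark:- |".format(
                opts=multitask_egs_opts, fmt=scp_or_ark,
                dir=egs_dir, idx=archive_index))

# e.g. cegs_rspecifier("exp/chain/tdnn_1a/egs", 3, True) reads archive 3
# through its scp index: ... scp:exp/chain/tdnn_1a/egs/cegs.3.scp ark:- |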
""" deriv_time_opts = [] @@ -168,6 +175,12 @@ def train_new_models(dir, iter, srand, num_jobs, frame_shift = ((archive_index + k/num_archives) % frame_subsampling_factor) + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="cegs.", + archive_index=archive_index, + use_multitask_egs=use_multitask_egs) + scp_or_ark = "scp" if use_multitask_egs else "ark" cache_io_opts = (("--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) if iter > 0 else "") + @@ -184,9 +197,9 @@ def train_new_models(dir, iter, srand, num_jobs, --print-interval=10 --momentum={momentum} \ --max-param-change={max_param_change} \ "{raw_model}" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs \ + "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} \ --frame-shift={fr_shft} \ - ark:{egs_dir}/cegs.{archive_index}.ark ark:- | \ + {scp_or_ark}:{egs_dir}/cegs.{archive_index}.{scp_or_ark} ark:- | \ nnet3-chain-shuffle-egs --buffer-size={buf_size} \ --srand={srand} ark:- ark:- | nnet3-chain-merge-egs \ --minibatch-size={num_chunk_per_mb} ark:- ark:- |" \ @@ -206,17 +219,17 @@ def train_new_models(dir, iter, srand, num_jobs, raw_model=raw_model_string, egs_dir=egs_dir, archive_index=archive_index, buf_size=shuffle_buffer_size, - num_chunk_per_mb=num_chunk_per_minibatch_str), + num_chunk_per_mb=num_chunk_per_minibatch_str, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark), require_zero_status=True) threads.append(thread) - for thread in threads: thread.join() - def train_one_iteration(dir, iter, srand, egs_dir, num_jobs, num_archives_processed, num_archives, learning_rate, shrinkage_value, @@ -227,7 +240,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, frame_subsampling_factor, - run_opts, dropout_edit_string=""): + run_opts, dropout_edit_string="", + use_multitask_egs=False): """ Called from steps/nnet3/chain/train.py for one iteration for neural network training with LF-MMI objective @@ -259,7 +273,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, compute_train_cv_probabilities( dir=dir, iter=iter, egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, - leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts) + leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, + use_multitask_egs=use_multitask_egs) if iter > 0: # Runs in the background @@ -311,7 +326,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, shuffle_buffer_size=shuffle_buffer_size, num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str, frame_subsampling_factor=frame_subsampling_factor, - run_opts=run_opts) + run_opts=run_opts, + use_multitask_egs=use_multitask_egs) [models_to_average, best_model] = common_train_lib.get_successful_models( num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) @@ -363,7 +379,7 @@ def check_for_required_files(feat_dir, tree_dir, lat_dir): def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, max_lda_jobs=None, rand_prune=4.0, - lda_opts=None): + lda_opts=None, use_multitask_egs=False): """ Function to estimate and write LDA matrix from cegs This function is exactly similar to the version in module @@ -373,17 +389,28 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, if max_lda_jobs is not None: if num_lda_jobs > max_lda_jobs: num_lda_jobs = max_lda_jobs + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="cegs.", + archive_index="JOB", + 
use_multitask_egs=use_multitask_egs) + scp_or_ark = "scp" if use_multitask_egs else "ark" + egs_rspecifier = ( + "ark:nnet3-chain-copy-egs {multitask_egs_opts} " + "{scp_or_ark}:{egs_dir}/cegs.JOB.{scp_or_ark} ark:- |" + "".format(egs_dir=egs_dir, scp_or_ark=scp_or_ark, + multitask_egs_opts=multitask_egs_opts)) # Write stats with the same format as stats for LDA. common_lib.execute_command( """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ nnet3-chain-acc-lda-stats --rand-prune={rand_prune} \ - {dir}/init.raw "ark:{egs_dir}/cegs.JOB.ark" \ + {dir}/init.raw "{egs_rspecifier}" \ {dir}/JOB.lda_stats""".format( command=run_opts.command, num_lda_jobs=num_lda_jobs, dir=dir, - egs_dir=egs_dir, + egs_rspecifier=egs_rspecifier, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats @@ -438,32 +465,50 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1): def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, - run_opts): + run_opts, + use_multitask_egs=False): model = '{0}/{1}.mdl'.format(dir, iter) + scp_or_ark = "scp" if use_multitask_egs else "ark" + egs_suffix = ".scp" if use_multitask_egs else ".cegs" + + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="valid_diagnostic.", + use_multitask_egs=use_multitask_egs) + common_lib.background_command( """{command} {dir}/log/compute_prob_valid.{iter}.log \ nnet3-chain-compute-prob --l2-regularize={l2} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/valid_diagnostic.cegs \ + "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/valid_diagnostic{egs_suffix} \ ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ """.format(command=run_opts.command, dir=dir, iter=iter, model=model, l2=l2_regularize, leaky=leaky_hmm_coefficient, xent_reg=xent_regularize, - egs_dir=egs_dir)) + egs_dir=egs_dir, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)) + + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="train_diagnostic.", + use_multitask_egs=use_multitask_egs) common_lib.background_command( """{command} {dir}/log/compute_prob_train.{iter}.log \ nnet3-chain-compute-prob --l2-regularize={l2} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/train_diagnostic.cegs \ + "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/train_diagnostic{egs_suffix} \ ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ """.format(command=run_opts.command, dir=dir, iter=iter, model=model, l2=l2_regularize, leaky=leaky_hmm_coefficient, xent_reg=xent_regularize, - egs_dir=egs_dir)) + egs_dir=egs_dir, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)) def compute_progress(dir, iter, run_opts): @@ -483,10 +528,12 @@ def compute_progress(dir, iter, run_opts): model=model, prev_model=prev_model)) + def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, egs_dir, leaky_hmm_coefficient, l2_regularize, xent_regularize, run_opts, - sum_to_one_penalty=0.0): + sum_to_one_penalty=0.0, + use_multitask_egs=False): """ Function to do model combination In the nnet3 setup, the logic @@ -512,6 +559,14 @@ def 
combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st print("{0}: warning: model file {1} does not exist " "(final combination)".format(sys.argv[0], model_file)) + scp_or_ark = "scp" if use_multitask_egs else "ark" + egs_suffix = ".scp" if use_multitask_egs else ".cegs" + + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="combine.", + use_multitask_egs=use_multitask_egs) + # We reverse the order of the raw model strings so that the freshest one # goes first. This is important for systems that include batch # normalization-- it means that the freshest batch-norm stats are used. @@ -529,7 +584,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st --sum-to-one-penalty={penalty} \ --enforce-positive-weights=true \ --verbose=3 {dir}/den.fst {raw_models} \ - "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/combine.cegs ark:- | \ + "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/combine{egs_suffix} ark:- | \ nnet3-chain-merge-egs --minibatch-size={num_chunk_per_mb} \ ark:- ark:- |" - \| \ nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl \ @@ -544,7 +599,9 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st penalty=sum_to_one_penalty, num_chunk_per_mb=num_chunk_per_minibatch_str, num_iters=num_iters, - egs_dir=egs_dir)) + egs_dir=egs_dir, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the @@ -553,4 +610,5 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st dir=dir, iter='final', egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, leaky_hmm_coefficient=leaky_hmm_coefficient, - run_opts=run_opts) + run_opts=run_opts, + use_multitask_egs=use_multitask_egs) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 49565e6bc7e..854f74cc556 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -399,7 +399,7 @@ def verify_egs_dir(egs_dir, feat_dim, ivector_dim, ivector_extractor_id, if (feat_dim != 0 and feat_dim != egs_feat_dim) or (ivector_dim != egs_ivector_dim): raise Exception("There is mismatch between featdim/ivector_dim of " "the current experiment and the provided " - "egs directory") + "egs directory: egs_dim: {0} vs {1} and ivector_dim {2} vs {3}".format(feat_dim, egs_feat_dim, ivector_dim, egs_ivector_dim)) if (((egs_ivector_id is None) and (ivector_extractor_id is not None)) or ((egs_ivector_id is not None) and (ivector_extractor_id is None))): diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 057ad25d58b..c775bd4ff41 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -310,21 +310,32 @@ def train_one_iteration(dir, iter, srand, egs_dir, def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, max_lda_jobs=None, rand_prune=4.0, - lda_opts=None): + lda_opts=None, use_multitask_egs=False): if max_lda_jobs is not None: if num_lda_jobs > max_lda_jobs: num_lda_jobs = max_lda_jobs + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="egs.", + archive_index="JOB", + 
use_multitask_egs=use_multitask_egs) + scp_or_ark = "scp" if use_multitask_egs else "ark" + egs_rspecifier = ( + "ark:nnet3-copy-egs {multitask_egs_opts} " + "{scp_or_ark}:{egs_dir}/egs.JOB.{scp_or_ark} ark:- |" + "".format(egs_dir=egs_dir, scp_or_ark=scp_or_ark, + multitask_egs_opts=multitask_egs_opts)) # Write stats with the same format as stats for LDA. common_lib.execute_command( """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ nnet3-acc-lda-stats --rand-prune={rand_prune} \ - {dir}/init.raw "ark:{egs_dir}/egs.JOB.ark" \ + {dir}/init.raw "{egs_rspecifier}" \ {dir}/JOB.lda_stats""".format( command=run_opts.command, num_lda_jobs=num_lda_jobs, dir=dir, - egs_dir=egs_dir, + egs_rspecifier=egs_rspecifier, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats @@ -412,7 +423,6 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, run_opts, multitask_egs_opts=multitask_egs_opts)) - def compute_progress(dir, iter, egs_dir, run_opts, get_raw_nnet_from_am=True, diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree_from_lats.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree_from_lats.sh new file mode 100755 index 00000000000..6ed988062b3 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree_from_lats.sh @@ -0,0 +1,201 @@ +#!/bin/bash +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + + +# This script builds a tree for use in the 'chain' systems (although the script +# itself is pretty generic and doesn't use any 'chain' binaries). This is just +# like the first stages of a standard system, like 'train_sat.sh', except it +# does 'convert-ali' to convert alignments to a monophone topology just created +# from the 'lang' directory (in case the topology is different from where you +# got the system's alignments from), and it stops after the tree-building and +# model-initialization stage, without re-estimating the Gaussians or training +# the transitions. + + +# Begin configuration section. +stage=-5 +exit_stage=-100 # you can use this to require it to exit at the + # beginning of a specific stage. Not all values are + # supported. +cmd=run.pl +context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone. +cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves +frame_subsampling_factor=1 +alignment_subsampling_factor=1 +leftmost_questions_truncate=-1 # note: this used to default to 10, but we never + # use this option now with value != -1, and + # we're changing the default +acwt=0.1 +tree_stats_opts= +cluster_phones_opts= +repeat_frames=false +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "Usage: steps/train_sat.sh <#leaves> " + echo " e.g.: steps/train_sat.sh 2500 15000 data/train_si84 data/lang exp/tri2b_lats_si84 exp/tri3b" + echo "Main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --config # config containing options" + echo " --stage # stage to do partial re-run from." + echo " --repeat-frames # Only affects alignment conversion at" + echo " # the end. 
If true, generate an " + echo " # alignment using the frame-subsampled " + echo " # topology that is repeated " + echo " # --frame-subsampling-factor times " + echo " # and interleaved, to be the same " + echo " # length as the original alignment " + echo " # (useful for cross-entropy training " + echo " # of reduced frame rate systems)." + exit 1; +fi + +numleaves=$1 +data=$2 +lang=$3 +lat_dir=$4 +dir=$5 + +for f in $data/feats.scp $lang/phones.txt $lat_dir/final.mdl $lat_dir/lat.1.gz; do + [ ! -f $f ] && echo "train_sat.sh: no such file $f" && exit 1; +done + +oov=`cat $lang/oov.int` +nj=`cat $lat_dir/num_jobs` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` +ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1; +sdata=$data/split$nj; +splice_opts=`cat $lat_dir/splice_opts 2>/dev/null` # frame-splicing options. +cmvn_opts=`cat $lat_dir/cmvn_opts 2>/dev/null` +delta_opts=`cat $lat_dir/delta_opts 2>/dev/null` + +mkdir -p $dir/log +cp $lat_dir/splice_opts $dir 2>/dev/null # frame-splicing options. +cp $lat_dir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $lat_dir/delta_opts $dir 2>/dev/null # delta option. + +utils/lang/check_phones_compatible.sh $lang/phones.txt $lat_dir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; + +echo $nj >$dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +# Set up features. + +if [ -f $lat_dir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +## Set up speaker-independent features. +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $lat_dir/final.mat ark:- ark:- |" + cp $lat_dir/final.mat $dir + cp $lat_dir/full.mat $dir 2>/dev/null + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac + +# Add fMLLR transforms if available +if [ -f $lat_dir/trans.1 ]; then + echo "$0: Using transforms from $lat_dir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$lat_dir/trans.JOB ark:- ark:- |" +fi + +# Do subsampling of feats, if needed +if [ $frame_subsampling_factor -gt 1 ]; then + feats="$feats subsample-feats --n=$frame_subsampling_factor ark:- ark:- |" +fi + +if [ $stage -le -5 ]; then + echo "$0: Initializing monophone model (for alignment conversion, in case topology changed)" + + [ ! -f $lang/phones/sets.int ] && exit 1; + shared_phones_opt="--shared-phones=$lang/phones/sets.int" + # get feature dimension + example_feats="`echo $feats | sed s/JOB/1/g`"; + if ! feat_dim=$(feat-to-dim "$example_feats" - 2>/dev/null) || [ -z $feat_dim ]; then + feat-to-dim "$example_feats" - # to see the error message. + echo "error getting feature dimension" + exit 1; + fi + $cmd JOB=1 $dir/log/init_mono.log \ + gmm-init-mono $shared_phones_opt "--train-feats=$feats subset-feats --n=10 ark:- ark:-|" $lang/topo $feat_dim \ + $dir/mono.mdl $dir/mono.tree || exit 1; +fi + + +if [ $stage -le -4 ]; then + # Get tree stats. 
+ echo "$0: Accumulating tree stats" + $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \ + lattice-best-path --acoustic-scale=$acwt \ + "ark:gunzip -c $lat_dir/lat.JOB.gz|" ark:/dev/null ark:- \| \ + convert-ali --frame-subsampling-factor=$alignment_subsampling_factor \ + $lat_dir/final.mdl $dir/mono.mdl $dir/mono.tree ark:- ark:- \| \ + acc-tree-stats $context_opts $tree_stats_opts --ci-phones=$ciphonelist $dir/mono.mdl \ + "$feats" ark:- $dir/JOB.treeacc || exit 1; + [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1; + $cmd $dir/log/sum_tree_acc.log \ + sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1; + rm $dir/*.treeacc +fi + +if [ $stage -le -3 ] && $train_tree; then + echo "$0: Getting questions for tree clustering." + # preparing questions, roots file... + $cmd $dir/log/questions.log \ + cluster-phones $cluster_phones_opts $context_opts $dir/treeacc \ + $lang/phones/sets.int $dir/questions.int || exit 1; + cat $lang/phones/extra_questions.int >> $dir/questions.int + $cmd $dir/log/compile_questions.log \ + compile-questions --leftmost-questions-truncate=$leftmost_questions_truncate \ + $context_opts $lang/topo $dir/questions.int $dir/questions.qst || exit 1; + + # questions_truncated.int will be needed later on when we build the phone + # language model for 'chain' training. It's a mechanism of keeping the graph + # small. + if [ $leftmost_questions_truncate -gt 0 ]; then + head -n $leftmost_questions_truncate $dir/questions.int > $dir/questions_truncated.int + else + cp $dir/questions.int $dir/questions_truncated.int + fi + + echo "$0: Building the tree" + $cmd $dir/log/build_tree.log \ + build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; +fi + +if [ $stage -le -2 ]; then + echo "$0: Initializing the model" + gmm-init-model --write-occs=$dir/1.occs \ + $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1; + grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning."; + rm $dir/treeacc +fi + +if [ $stage -le -1 ]; then + # Convert the alignments to the new tree. Note: we likely will not use these + # converted alignments in the CTC system directly, but they could be useful + # for other purposes. + echo "$0: Converting alignments from $lat_dir to use current tree" + $cmd JOB=1:$nj $dir/log/convert.JOB.log \ + lattice-best-path --acoustic-scale=$acwt \ + "ark:gunzip -c $lat_dir/lat.JOB.gz |" ark:/dev/null ark:- \| \ + convert-ali --repeat-frames=$repeat_frames \ + --frame-subsampling-factor=$alignment_subsampling_factor \ + $lat_dir/final.mdl $dir/1.mdl $dir/tree \ + ark:- "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +cp $dir/1.mdl $dir/final.mdl + +echo $0: Done building tree + diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh new file mode 100755 index 00000000000..79c1f654ee4 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh @@ -0,0 +1,283 @@ +#!/bin/bash +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + + +# This script builds a tree for use in the 'chain' systems (although the script +# itself is pretty generic and doesn't use any 'chain' binaries). 
This is just +# like the first stages of a standard system, like 'train_sat.sh', except it +# does 'convert-ali' to convert alignments to a monophone topology just created +# from the 'lang' directory (in case the topology is different from where you +# got the system's alignments from), and it stops after the tree-building and +# model-initialization stage, without re-estimating the Gaussians or training +# the transitions. + + +# Begin configuration section. +stage=-5 +exit_stage=-100 # you can use this to require it to exit at the + # beginning of a specific stage. Not all values are + # supported. +cmd=run.pl +use_fmllr=true +context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone. +cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves +frame_subsampling_factor=1 +leftmost_questions_truncate=-1 # note: this used to default to 10, but we never + # use this option now with value != -1, and + # we're changing the default +tree_stats_opts= +cluster_phones_opts= +repeat_frames=false +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -lt 5 ]; then + echo "Usage: steps/nnet3/chain/build_tree_multiple_sources.sh <#leaves> [ ... ] " + echo " e.g.: steps/nnet3/chain/build_tree_multiple_sources.sh 15000 data/train_semi data/lang data/train_sup:exp/tri3_ali data/train_unsup:exp/tri3/best_path_train_unsup exp/tree_semi" + echo "Main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --config # config containing options" + echo " --stage # stage to do partial re-run from." + echo " --repeat-frames # Only affects alignment conversion at" + echo " # the end. If true, generate an " + echo " # alignment using the frame-subsampled " + echo " # topology that is repeated " + echo " # --frame-subsampling-factor times " + echo " # and interleaved, to be the same " + echo " # length as the original alignment " + echo " # (useful for cross-entropy training " + echo " # of reduced frame rate systems)." + exit 1; +fi + +numleaves=$1 +lang=$2 +dir=${@: -1} # last argument to the script +shift 2; +data_and_alidirs=( $@ ) # read the remaining arguments into an array +unset data_and_alidirs[${#data_and_alidirs[@]}-1] # 'pop' the last argument which is odir +num_sys=$[${#data_and_alidirs[@]}] # number of systems to combine + +if (( $num_sys % 2 != 0 )); then + echo "$0: The data and alignment arguments must be an even number of arguments." + exit 1 +fi + +num_sys=$((num_sys % 2)) + +data=$dir/data_tmp +mkdir -p $data + +mkdir -p $dir +alidir=`echo ${data_and_alidirs[0]} | cut -d: -s -f2` + +datadirs=() +alidirs=() +for n in `seq 0 $[num_sys-1]`; do + datadirs[$n]=${data_and_alidirs[$[2*n]]} + alidirs[$n]=${data_and_alidirs[$[2*n+1]]} +done + +utils/combine_data.sh $data ${datadirs[@]} || exit 1 + +for f in $data/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +oov=`cat $lang/oov.int` +nj=`cat $alidir/num_jobs` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` +ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1; +sdata=$data/split$nj; +splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. 
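build_tree_multiple_sources.sh consumes its middle arguments as alternating data and alignment (or best-path) directories, which is how the 2*n / 2*n+1 indexing above reads them. A Python sketch of the intended pairing, assuming the leading numleaves and lang arguments and the trailing output directory have already been stripped:

def split_data_and_ali_args(args):
    # args alternate: data1 ali1 data2 ali2 ...
    if len(args) % 2 != 0:
        raise ValueError("data and alignment arguments must come in pairs")
    num_sys = len(args) // 2
    datadirs = [args[2 * n] for n in range(num_sys)]
    alidirs = [args[2 * n + 1] for n in range(num_sys)]
    return datadirs, alidirs

# e.g. two sources, supervised alignments plus best paths on unsupervised data:
assert split_data_and_ali_args(
    ["data/train_sup", "exp/tri3_ali",
     "data/train_unsup", "exp/tri3/best_path_train_unsup"]) == (
        ["data/train_sup", "data/train_unsup"],
        ["exp/tri3_ali", "exp/tri3/best_path_train_unsup"])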
+cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` || exit 1 +delta_opts=`cat $alidir/delta_opts 2>/dev/null` + +mkdir -p $dir/log +cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options. +cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null # delta option. + +utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; + +echo $nj >$dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +# Set up features. +if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi + +echo "$0: feature type is $feat_type" + +feats=() +for n in `seq 0 $[num_sys-1]`; do + this_nj=$(cat ${alidirs[$n]}/num_jobs) || exit 1 + this_sdata=${datadirs[$n]}/split$this_nj + [[ -d $this_sdata && ${datadirs[$n]}/feats.scp -ot $this_sdata ]] || split_data.sh ${datadirs[$n]} $this_nj || exit 1; + ## Set up speaker-independent features. + case $feat_type in + delta) feats[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/JOB/utt2spk scp:$this_sdata/JOB/cmvn.scp scp:$this_sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; + lda) feats[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/JOB/utt2spk scp:$this_sdata/JOB/cmvn.scp scp:$this_sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" + cp $alidir/final.mat $dir + cp $alidir/full.mat $dir 2>/dev/null + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; + esac + + if $use_fmllr; then + if [ ! -f $this_alidir/trans.1 ]; then + echo "$0: Could not find fMLLR transforms in $this_alidir" + exit 1 + fi + + echo "$0: Using transforms from $this_alidir" + feats[i]="${feats[i]} transform-feats --utt2spk=ark:$this_sdata/JOB/utt2spk ark,s,cs:$this_alidir/trans.JOB ark:- ark:- |" + fi + + # Do subsampling of feats, if needed + if [ $frame_subsampling_factor -gt 1 ]; then + feats[$n]="${feats[$n]} subsample-feats --n=$frame_subsampling_factor ark:- ark:- |" + fi +done + +if [ $stage -le -5 ]; then + echo "$0: Initializing monophone model (for alignment conversion, in case topology changed)" + + [ ! -f $lang/phones/sets.int ] && exit 1; + shared_phones_opt="--shared-phones=$lang/phones/sets.int" + # get feature dimension + example_feats="`echo ${feats[0]} | sed s/JOB/1/g`"; + if ! feat_dim=$(feat-to-dim "$example_feats" - 2>/dev/null) || [ -z $feat_dim ]; then + feat-to-dim "$example_feats" - # to see the error message. + echo "error getting feature dimension" + exit 1; + fi + + for n in `seq 0 $[num_sys-1]`; do + copy-feats ${feats[$n]} ark:- + done | \ + gmm-init-mono $shared_phones_opt \ + "--train-feats=subset-feats --n=10 ark:- ark:-|" $lang/topo $feat_dim \ + $dir/mono.mdl $dir/mono.tree 2> $dir/log/init_mono.log || exit 1; +fi + + +if [ $stage -le -4 ]; then + # Get tree stats. 
+ + for n in `seq 0 $[num_sys-1]`; do + echo "$0: Accumulating tree stats" + this_data=${datadirs[$n]} + this_alidir=${alidirs[$n]} + this_nj=$(cat $this_alidir/num_jobs) || exit 1 + this_frame_subsampling_factor=1 + if [ -f $this_alidir/frame_subsampling_factor ]; then + this_frame_subsampling_factor=$(cat $this_alidir/frame_subsampling_factor) + fi + + if (( $frame_subsampling_factor % $this_frame_subsampling_factor != 0 )); then + echo "$0: frame-subsampling-factor=$frame_subsampling_factor is not " + echo "divisible by $this_frame_subsampling_factor (that of $this_alidir)" + exit 1 + fi + + this_frame_subsampling_factor=$((frame_subsampling_factor / this_frame_subsampling_factor)) + $cmd JOB=1:$this_nj $dir/log/acc_tree.$n.JOB.log \ + convert-ali --frame-subsampling-factor=$this_frame_subsampling_factor \ + $this_alidir/final.mdl $dir/mono.mdl $dir/mono.tree "ark:gunzip -c $this_alidir/ali.JOB.gz|" ark:- \| \ + acc-tree-stats $context_opts $tree_stats_opts --ci-phones=$ciphonelist $dir/mono.mdl \ + "${feats[$n]}" ark:- $dir/$n.JOB.treeacc || exit 1; + [ "`ls $dir/$n.*.treeacc | wc -w`" -ne "$this_nj" ] && echo "$0: Wrong #tree-accs for data $n $this_data" && exit 1; + done + + $cmd $dir/log/sum_tree_acc.log \ + sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1; + rm $dir/*.treeacc +fi + +if [ $stage -le -3 ] && $train_tree; then + echo "$0: Getting questions for tree clustering." + # preparing questions, roots file... + $cmd $dir/log/questions.log \ + cluster-phones $cluster_phones_opts $context_opts $dir/treeacc \ + $lang/phones/sets.int $dir/questions.int || exit 1; + cat $lang/phones/extra_questions.int >> $dir/questions.int + $cmd $dir/log/compile_questions.log \ + compile-questions --leftmost-questions-truncate=$leftmost_questions_truncate \ + $context_opts $lang/topo $dir/questions.int $dir/questions.qst || exit 1; + + # questions_truncated.int will be needed later on when we build the phone + # language model for 'chain' training. It's a mechanism of keeping the graph + # small. + if [ $leftmost_questions_truncate -gt 0 ]; then + head -n $leftmost_questions_truncate $dir/questions.int > $dir/questions_truncated.int + else + cp $dir/questions.int $dir/questions_truncated.int + fi + + echo "$0: Building the tree" + $cmd $dir/log/build_tree.log \ + build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; +fi + +if [ $stage -le -2 ]; then + echo "$0: Initializing the model" + gmm-init-model --write-occs=$dir/1.occs \ + $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1; + grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning."; + rm $dir/treeacc +fi + +if [ $stage -le -1 ]; then + # Convert the alignments to the new tree. Note: we likely will not use these + # converted alignments in the CTC system directly, but they could be useful + # for other purposes. 
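Each alignment source may itself have been generated at a reduced frame rate, so before running convert-ali the script divides the requested frame-subsampling factor by the source directory's own factor and insists that the division is exact. The same check as a small Python sketch (the function name is illustrative):

def relative_subsampling_factor(target_factor, source_factor):
    # Factor to pass to convert-ali for a source whose alignments are already
    # subsampled by source_factor, when the tree/chain setup uses target_factor.
    if target_factor % source_factor != 0:
        raise ValueError(
            "frame-subsampling-factor={0} is not divisible by {1}".format(
                target_factor, source_factor))
    return target_factor // source_factor

assert relative_subsampling_factor(3, 1) == 3   # regular alignments
assert relative_subsampling_factor(3, 3) == 1   # already at the chain frame rate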
+ + for n in `seq 0 $[num_sys-1]`; do + this_alidir=${alidirs[$n]} + this_nj=$(cat $this_alidir/num_jobs) || exit 1 + + this_frame_subsampling_factor=1 + if [ -f $this_alidir/frame_subsampling_factor ]; then + this_frame_subsampling_factor=$(cat $this_alidir/frame_subsampling_factor) + fi + + if (( $frame_subsampling_factor % $this_frame_subsampling_factor != 0 )); then + echo "$0: frame-subsampling-factor=$frame_subsampling_factor is not " + echo "divisible by $this_frame_subsampling_factor (hat of $this_alidir)" + exit 1 + fi + + this_frame_subsampling_factor=$((frame_subsampling_factor / this_frame_subsampling_factor)) + echo "$0: Converting alignments from $alidir to use current tree" + $cmd JOB=1:$this_nj $dir/log/convert.$n.JOB.log \ + convert-ali --repeat-frames=$repeat_frames \ + --frame-subsampling-factor=$this_frame_subsampling_factor \ + $alidir/final.mdl $dir/1.mdl $dir/tree \ + ark,scp:$dir/ali.$n.JOB.ark,$dir/ali.$n.JOB.scp + + for i in `seq $this_nj`; do + cat $dir/ali.$n.$i.scp + done > $dir/ali.$n.scp + done + + for n in `seq 0 $[num_sys-1]`; do + cat $dir/ali.$n.scp + done | sort -k1,1 > $dir/ali.scp + + utils/split_data.sh $data $nj + $cmd JOB=1:$nj $dir/log/copy_alignments.JOB.log \ + copy-int-vector "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/ali.scp |" \ + "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 +fi + +cp $dir/1.mdl $dir/final.mdl + +echo $0: Done building tree diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 076dc95b2d7..f478d3a811a 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -71,7 +71,10 @@ egs_weight=1.0 # The weight which determines how much each training example # to down/up-weight a dataset) lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, # before being used to get supervisions. +acwt=0.1 # For pruning phone_insertion_penalty= +deriv_weights_scp= +generate_egs_scp=false echo "$0 $@" # Print the command line for logging @@ -297,7 +300,7 @@ if [ $stage -le 2 ]; then echo "$0: copying training lattices" [ ! -z $lattice_prune_beam ] && \ - prune_cmd="ark:- | lattice-prune --acoustic-scale=0.1 --beam=$lattice_prune_beam ark:-" + prune_cmd="ark:- | lattice-prune --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:-" $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" $prune_cmd ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; @@ -309,6 +312,7 @@ egs_opts="--left-context=$left_context --right-context=$right_context --num-fram [ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" [ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" +[ ! -z "$deriv_weights_scp" ] && egs_opts="$egs_opts --deriv-weights-rspecifier=scp:$deriv_weights_scp" chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor" [ ! -z $right_tolerance ] && \ @@ -317,8 +321,16 @@ chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$ali [ ! -z $left_tolerance ] && \ chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance" -[ ! -z $lattice_lm_scale ] && \ +normalization_scale=1.0 +if [ ! 
-z "$lattice_lm_scale" ]; then chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" + normalization_scale=$(perl -e " + if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) { + print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; + exit(1); + } + print (1.0 - $lattice_lm_scale);") +fi [ ! -z $phone_insertion_penalty ] && \ chain_supervision_all_opts="$chain_supervision_all_opts --phone-ins-penalty=$phone_insertion_penalty" @@ -343,7 +355,7 @@ if [ $stage -le 3 ]; then chain-get-supervision $chain_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=$srand \ - $egs_opts $chaindir/normalization.fst \ + $egs_opts --normalization-scale=$normalization_scale $chaindir/normalization.fst \ "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || touch $dir/.error & $cmd $dir/log/create_train_subset.log \ utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ @@ -351,27 +363,40 @@ if [ $stage -le 3 ]; then chain-get-supervision $chain_supervision_all_opts \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=$srand \ - $egs_opts $chaindir/normalization.fst \ + $egs_opts --normalization-scale=$normalization_scale $chaindir/normalization.fst \ "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || touch $dir/.error & wait; [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 echo "... Getting subsets of validation examples for diagnostics and combination." + if $generate_egs_scp; then + valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.cegs,$dir/valid_diagnostic.scp" + train_diagnostic_output="ark,scp:$dir/train_diagnostic.cegs,$dir/train_diagnostic.scp" + else + valid_diagnostic_output="ark:$dir/valid_diagnostic.cegs" + train_diagnostic_output="ark:$dir/train_diagnostic.cegs" + fi $cmd $dir/log/create_valid_subset_combine.log \ nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \ ark:$dir/valid_combine.cegs || touch $dir/.error & $cmd $dir/log/create_valid_subset_diagnostic.log \ nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \ - ark:$dir/valid_diagnostic.cegs || touch $dir/.error & + $valid_diagnostic_output || touch $dir/.error & $cmd $dir/log/create_train_subset_combine.log \ nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \ ark:$dir/train_combine.cegs || touch $dir/.error & $cmd $dir/log/create_train_subset_diagnostic.log \ nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ - ark:$dir/train_diagnostic.cegs || touch $dir/.error & + $train_diagnostic_output || touch $dir/.error & wait sleep 5 # wait for file system to sync. - cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs + if $generate_egs_scp; then + cat $dir/valid_combine.cegs $dir/train_combine.cegs | \ + nnet3-chain-copy-egs ark:- ark,scp:$dir/combine.cegs,$dir/combine.scp + rm $dir/{train,valid}_combine.scp + else + cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs + fi for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do [ ! -s $f ] && echo "No examples in file $f" && exit 1; @@ -421,16 +446,34 @@ if [ $stage -le 5 ]; then done if [ $archives_multiple == 1 ]; then # normal case. 
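Earlier in this get_egs.sh hunk, supplying --lattice-lm-scale makes the supervision normalization FST get applied with scale 1 - lattice_lm_scale, and the inline perl rejects values outside [0, 1). The equivalent computation as a Python sketch:

def normalization_scale(lattice_lm_scale=None):
    # Default when no --lattice-lm-scale is given: full normalization weight.
    if lattice_lm_scale is None:
        return 1.0
    if lattice_lm_scale < 0.0 or lattice_lm_scale >= 1.0:
        raise ValueError("Invalid --lattice-lm-scale {0}".format(lattice_lm_scale))
    # The lattice scores already carry lattice_lm_scale of the LM weight,
    # so the normalization FST presumably supplies the remainder.
    return 1.0 - lattice_lm_scale

assert normalization_scale() == 1.0
assert normalization_scale(0.5) == 0.5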
+ if $generate_egs_scp; then + output_archive="ark,scp:$dir/cegs.JOB.ark,$dir/cegs.JOB.scp" + else + output_archive="ark:$dir/cegs.JOB.ark" + fi $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ - nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:$dir/cegs.JOB.ark || exit 1; + nnet3-chain-normalize-egs --normalization-scale=$normalization_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- $output_archive || exit 1; + + if $generate_egs_scp; then + #concatenate cegs.JOB.scp in single cegs.scp + rm -rf $dir/cegs.scp + for j in $(seq $num_archives_intermediate); do + cat $dir/cegs.$j.scp || exit 1; + done > $dir/cegs.scp || exit 1; + for f in $dir/cegs.*.scp; do rm $f; done + fi else # we need to shuffle the 'intermediate archives' and then split into the # final archives. we create soft links to manage this splitting, because # otherwise managing the output names is quite difficult (and we don't want # to submit separate queue jobs for each intermediate archive, because then # the --max-jobs-run option is hard to enforce). - output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)" + if $generate_egs_scp; then + output_archives="$(for y in $(seq $archives_multiple); do echo ark,scp:$dir/cegs.JOB.$y.ark,$dir/cegs.JOB.$y.scp; done)" + else + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)" + fi for x in $(seq $num_archives_intermediate); do for y in $(seq $archives_multiple); do archive_index=$[($x-1)*$archives_multiple+$y] @@ -439,9 +482,20 @@ if [ $stage -le 5 ]; then done done $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-normalize-egs --normalization-scale=$normalization_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:- \| \ nnet3-chain-copy-egs ark:- $output_archives || exit 1; + + if $generate_egs_scp; then + #concatenate cegs.JOB.scp in single cegs.scp + rm -rf $dir/cegs.scp + for j in $(seq $num_archives_intermediate); do + for y in $(seq $num_archives_intermediate); do + cat $dir/cegs.$j.$y.scp || exit 1; + done + done > $dir/cegs.scp || exit 1; + for f in $dir/cegs.*.*.scp; do rm $f; done + fi fi fi diff --git a/egs/wsj/s5/steps/nnet3/chain/make_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_den_fst.sh new file mode 100755 index 00000000000..3467e887cd5 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/make_den_fst.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Copyright 2014-17 Vimal Manohar + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
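Still in get_egs.sh: when the intermediate archives are split further (archives_multiple > 1), the soft-link bookkeeping described above maps piece y of intermediate archive x onto final archive (x-1)*archives_multiple + y. A tiny sketch of that numbering:

def final_archive_index(x, y, archives_multiple):
    # x: 1-based intermediate archive, y: 1-based piece within it.
    return (x - 1) * archives_multiple + y

# e.g. 3 intermediate archives split 2 ways cover final archives 1..6:
assert [final_archive_index(x, y, 2)
        for x in (1, 2, 3) for y in (1, 2)] == [1, 2, 3, 4, 5, 6]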
+ +# This script creates denominator FST (den.fst) and normalization.fst for +# chain training. It additional copies the transition model and tree from the +# first alignment directory to the chain directory. +# This script can accept multiple sources of alignments that can be +# weighted to estimate phone LM. + +set -o pipefail + +# begin configuration section. +cmd=run.pl +stage=-10 +weights= +#end configuration section. + +help_message="Usage: "$(basename $0)" [options] [ ...] + E.g. "$(basename $0)" exp/tri1_ali exp/tri2_ali exp/chain/tdnn_1a_sp +Options: + --cmd (run.pl|queue.pl...) # specify how to run the sub-processes. +"; + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -lt 2 ]; then + printf "$help_message\n"; + exit 1; +fi + +dir=${@: -1} # last argument to the script +ali_dirs=( $@ ) # read the remaining arguments into an array +unset ali_dirs[${#ali_dirs[@]}-1] # 'pop' the last argument which is odir +num_sys=${#ali_dirs[@]} # number of systems to combine + +mkdir -p $dir/log + +ali_dir=`echo ${ali_dirs[0]} | cut -d: -f1` + +for f in $ali_dir/ali.1.gz $ali_dir/final.mdl $ali_dir/tree; do + if [ ! -f $f ]; then + echo "$0: Could not find file $f" + exit 1 + fi +done + +cp $ali_dir/tree $dir/ || exit 1 + +for n in `seq 0 $[num_sys-1]`; do + adir=${ali_dirs[$n]} + alignments+=("ark:gunzip -c $adir/ali.*.gz | ali-to-phones $adir/final.mdl ark:- ark:- |") +done + +if [ $stage -le 1 ]; then + $cmd $dir/log/make_phone_lm.log \ + chain-est-phone-lm $lm_opts --scales="$weights" \ + "${alignments[@]}" $dir/phone_lm.fst || exit 1 +fi + +if [ $stage -le 2 ]; then + copy-transition-model $ali_dir/final.mdl $dir/0.trans_mdl +fi + +if [ $stage -le 3 ]; then + $cmd $dir/log/make_den_fst.log \ + chain-make-den-fst $dir/tree $dir/0.trans_mdl \ + $dir/phone_lm.fst \ + $dir/den.fst $dir/normalization.fst || exit 1 +fi + +exit 0 diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index dacfae99a2a..302cbb73eb5 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -377,13 +377,23 @@ def train(args, run_opts): logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir)) common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) + if not os.path.exists('{0}/valid_diagnostic.cegs'.format(egs_dir)): + if (not os.path.exists('{0}/valid_diagnostic.scp'.format(egs_dir))): + raise Exception('neither {0}/valid_diagnostic.cegs nor ' + '{0}/valid_diagnostic.scp exist.' 
+ 'This script expects one of them.'.format(egs_dir)) + use_multitask_egs = True + else: + use_multitask_egs = False + if (args.stage <= -2) and os.path.exists(args.dir+"/configs/init.config"): logger.info('Computing the preconditioning matrix for input features') chain_lib.compute_preconditioning_matrix( args.dir, egs_dir, num_archives, run_opts, max_lda_jobs=args.max_lda_jobs, - rand_prune=args.rand_prune) + rand_prune=args.rand_prune, + use_multitask_egs=use_multitask_egs) if (args.stage <= -1): logger.info("Preparing the initial acoustic model.") @@ -470,7 +480,8 @@ def train(args, run_opts): max_param_change=args.max_param_change, shuffle_buffer_size=args.shuffle_buffer_size, frame_subsampling_factor=args.frame_subsampling_factor, - run_opts=run_opts) + run_opts=run_opts, + use_multitask_egs=use_multitask_egs) if args.cleanup: # do a clean up everythin but the last 2 models, under certain @@ -504,7 +515,8 @@ def train(args, run_opts): l2_regularize=args.l2_regularize, xent_regularize=args.xent_regularize, run_opts=run_opts, - sum_to_one_penalty=args.combine_sum_to_one_penalty) + sum_to_one_penalty=args.combine_sum_to_one_penalty, + use_multitask_egs=use_multitask_egs) if args.cleanup: diff --git a/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py b/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py index 6372ba25e5e..860c444e342 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py +++ b/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py @@ -98,6 +98,13 @@ def get_args(): parser.add_argument("--samples-per-iter", type=int, default=40000, help="The target number of egs in each archive of egs, " "(prior to merging egs). ") + parser.add_argument("--frames-per-iter", type=int, default=400000, + help="The target number of frames in each archive of " + "egs") + parser.add_argument("--frames-per-eg-list", type=str, default=None, + action=common_lib.NullstrToNoneAction, + help="Number of frames per eg for each input language " + "as a comma separated list") parser.add_argument("--num-jobs", type=int, default=20, help="This can be used for better randomization in distributing " "examples for different languages across egs.*.scp files, " @@ -107,7 +114,7 @@ def get_args(): help="If true, egs.ranges.*.txt are generated " "randomly w.r.t distribution of remaining examples in " "each language, otherwise it is generated sequentially.", - default=True, choices = ["false", "true"]) + default=True, choices=["false", "true"]) parser.add_argument("--max-archives", type=int, default=1000, help="max number of archives used to generate egs.*.scp") parser.add_argument("--seed", type=int, default=1, @@ -129,7 +136,7 @@ def get_args(): # now the positional arguments parser.add_argument("egs_scp_lists", nargs='+', help="list of egs.scp files per input language." - "e.g. exp/lang1/egs/egs.scp exp/lang2/egs/egs.scp") + "e.g. exp/lang1/egs/egs.scp exp/lang2/egs/egs.scp") parser.add_argument("egs_dir", help="Name of egs directory e.g. 
exp/tdnn_multilingual_sp/egs") @@ -153,7 +160,7 @@ def select_random_lang(lang_len, tot_egs, random_selection): count = 0 for l in range(len(lang_len)): if random_selection: - if rand_int <= (count + lang_len[l]): + if rand_int <= (count + lang_len[l]): return l else: count += lang_len[l] @@ -172,6 +179,10 @@ def process_multilingual_egs(args): scp_lists = args.egs_scp_lists num_langs = len(scp_lists) + frames_per_eg = ([1 for x in scp_lists] + if args.frames_per_eg_list is None + else [int(x) for x in args.frames_per_eg_list.split(',')]) + scp_files = [open(scp_lists[lang], 'r') for lang in range(num_langs)] lang2len = [0] * num_langs @@ -182,7 +193,7 @@ def process_multilingual_egs(args): # If weights are not provided, the weights are 1.0. if args.lang2weight is None: - lang2weight = [ 1.0 ] * num_langs + lang2weight = [1.0] * num_langs else: lang2weight = args.lang2weight.split(",") assert(len(lang2weight) == num_langs) @@ -195,10 +206,16 @@ def process_multilingual_egs(args): # Each element of all_egs (one per num_archive * num_jobs) is # an array of 3-tuples (lang-id, local-start-egs-line, num-egs) all_egs = [] - lang_len = lang2len[:] - # total num of egs in all languages - tot_num_egs = sum(lang2len[i] for i in range(len(lang2len))) - num_archives = max(1, min(args.max_archives, tot_num_egs / args.samples_per_iter)) + num_frames_in_lang = [frames_per_eg[i] * lang2len[i] + for i in range(num_langs)] + for lang in range(num_langs): + logger.info("Number of frames for language {0} " + "is {1}.".format(lang, num_frames_in_lang[lang])) + + # total num of frames in all languages + tot_num_frames = sum(num_frames_in_lang[i] for i in range(num_langs)) + num_archives = max(1, min(args.max_archives, + tot_num_frames / args.frames_per_iter)) num_arch_file = open("{0}/info/{1}num_archives".format( args.egs_dir, @@ -206,7 +223,7 @@ def process_multilingual_egs(args): "w") print("{0}".format(num_archives), file=num_arch_file) num_arch_file.close() - this_num_egs_per_archive = tot_num_egs / (num_archives * args.num_jobs) + this_num_frames_per_archive = tot_num_frames / (num_archives * args.num_jobs) logger.info("Generating {0}scp.. 
temporary files used to " "generate {0}.scp.".format(args.egs_prefix)) @@ -216,29 +233,36 @@ def process_multilingual_egs(args): "".format(args.egs_dir, args.egs_prefix, job + 1, archive_index + 1), "w") - this_egs = [] # this will be array of 2-tuples (lang-id start-frame num-frames) + # this will be array of 2-tuples (lang-id start-frame num-frames) + this_egs = [] num_egs = 0 - while num_egs <= this_num_egs_per_archive: - num_left_egs = sum(num_left_egs_per_lang for - num_left_egs_per_lang in lang_len) - if num_left_egs > 0: - lang_id = select_random_lang(lang_len, num_left_egs, rand_select) - start_egs = lang2len[lang_id] - lang_len[lang_id] + num_frames = 0 + while num_frames <= this_num_frames_per_archive: + num_frames_left = sum(num_frames_in_lang) + if num_frames_left > 0: + lang_id = select_random_lang(num_frames_in_lang, + num_frames_left, rand_select) + start_egs = ( + lang2len[lang_id] + - num_frames_in_lang[lang_id] / frames_per_eg[lang_id]) this_egs.append((lang_id, start_egs, args.minibatch_size)) for scpline in range(args.minibatch_size): scp_key = scp_files[lang_id].readline().splitlines()[0] print("{0} {1}".format(scp_key, lang_id), file=archfile) - lang_len[lang_id] = lang_len[lang_id] - args.minibatch_size - num_egs = num_egs + args.minibatch_size + num_frames_in_lang[lang_id] -= ( + args.minibatch_size * frames_per_eg[lang_id]) + num_egs += args.minibatch_size + num_frames += args.minibatch_size * frames_per_eg[lang_id] # If num of remaining egs in each lang is less than minibatch_size, # they are discarded. - if lang_len[lang_id] < args.minibatch_size: - lang_len[lang_id] = 0 - logger.info("Done processing data for language {0}".format( - lang_id)) + if (num_frames_in_lang[lang_id] + < args.minibatch_size * frames_per_eg[lang_id]): + num_frames_in_lang[lang_id] = 0 + logger.info("Done processing data for language {0}" + "".format(lang_id)) else: logger.info("Done processing data for all languages.") break @@ -315,4 +339,4 @@ def main(): if __name__ == "__main__": - main() + main() diff --git a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh index 50148027806..1d19cac14d0 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh +++ b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh @@ -19,13 +19,15 @@ minibatch_size=512 # it is the number of consecutive egs that we take from # access. This does not have to be the actual minibatch size; num_jobs=10 # helps for better randomness across languages # per archive. -samples_per_iter=400000 # this is the target number of egs in each archive of egs +frames_per_iter=400000 # this is the target number of egs in each archive of egs # (prior to merging egs). We probably should have called # it egs_per_iter. This is just a guideline; it will pick # a number that divides the number of samples in the # entire data. lang2weight= # array of weights one per input languge to scale example's output # w.r.t its input language during training. +allocate_opts= +egs_prefix=egs. stage=0 echo "$0 $@" # Print the command line for logging @@ -33,6 +35,12 @@ echo "$0 $@" # Print the command line for logging if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; +if [ $# -lt 3 ]; then + echo "Usage:$0 [opts] ... 
" + echo "Usage:$0 [opts] 2 exp/lang1/egs exp/lang2/egs exp/multi/egs" + exit 1; +fi + num_langs=$1 shift 1 @@ -47,7 +55,8 @@ if [ ${#args[@]} != $[$num_langs+1] ]; then exit 1; fi -required="egs.scp combine.scp train_diagnostic.scp valid_diagnostic.scp" +required="${egs_prefix}scp combine.scp train_diagnostic.scp valid_diagnostic.scp" +frames_per_eg_list= train_scp_list= train_diagnostic_scp_list= valid_diagnostic_scp_list= @@ -55,7 +64,7 @@ combine_scp_list= # read paramter from $egs_dir[0]/info and cmvn_opts # to write in multilingual egs_dir. -check_params="info/feat_dim info/ivector_dim info/left_context info/right_context info/frames_per_eg info/final.ie.id cmvn_opts" +check_params="info/feat_dim info/ivector_dim info/left_context info/right_context info/final.ie.id cmvn_opts" for param in $check_params; do cat ${args[0]}/$param > $megs_dir/$param || exit 1; done @@ -69,10 +78,19 @@ for lang in $(seq 0 $[$num_langs-1]);do echo "$0: no such file ${multi_egs_dir[$lang]}/$f." && exit 1; fi done - train_scp_list="$train_scp_list ${args[$lang]}/egs.scp" + train_scp_list="$train_scp_list ${args[$lang]}/${egs_prefix}scp" train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_diagnostic.scp" valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.scp" combine_scp_list="$combine_scp_list ${args[$lang]}/combine.scp" + + this_frames_per_eg=$(cat ${args[$lang]}/info/frames_per_eg) + + if [ $lang -eq 0 ]; then + frames_per_eg_list="$this_frames_per_eg" + echo $this_frames_per_eg > $megs_dir/info/frames_per_eg + else + frames_per_eg_list="$frames_per_eg_list,$this_frames_per_eg" + fi # check parameter dimension to be the same in all egs dirs for f in $check_params; do @@ -89,16 +107,18 @@ for lang in $(seq 0 $[$num_langs-1]);do done done +if [ ! -z "$lang2weight" ]; then + egs_opt="--lang2weight '$lang2weight'" +fi + if [ $stage -le 0 ]; then echo "$0: allocating multilingual examples for training." - if [ ! -z "$lang2weight" ]; then - egs_opt="--lang2weight '$lang2weight'" - fi - # Generate egs.*.scp for multilingual setup. + # Generate ${egs_prefix}*.scp for multilingual setup. $cmd $megs_dir/log/allocate_multilingual_examples_train.log \ steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ - --minibatch-size $minibatch_size \ - --samples-per-iter $samples_per_iter \ + ${allocate_opts} --minibatch-size $minibatch_size \ + --frames-per-iter $frames_per_iter --frames-per-eg-list $frames_per_eg_list \ + --egs-prefix "$egs_prefix" \ $train_scp_list $megs_dir || exit 1; fi @@ -106,20 +126,20 @@ if [ $stage -le 1 ]; then echo "$0: combine combine.scp examples from all langs in $megs_dir/combine.scp." # Generate combine.scp for multilingual setup. $cmd $megs_dir/log/allocate_multilingual_examples_combine.log \ - steps/nnet3/multilingual/allocate_multilingual_examples.py \ - --random-lang false \ - --max-archives 1 --num-jobs 1 \ - --minibatch-size $minibatch_size \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --random-lang false --max-archives 1 --num-jobs 1 \ + --frames-per-eg-list $frames_per_eg_list \ + ${allocate_opts} --minibatch-size $minibatch_size \ --egs-prefix "combine." \ $combine_scp_list $megs_dir || exit 1; echo "$0: combine train_diagnostic.scp examples from all langs in $megs_dir/train_diagnostic.scp." # Generate train_diagnostic.scp for multilingual setup. 
$cmd $megs_dir/log/allocate_multilingual_examples_train_diagnostic.log \ - steps/nnet3/multilingual/allocate_multilingual_examples.py \ - --random-lang false \ - --max-archives 1 --num-jobs 1 \ - --minibatch-size $minibatch_size \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --random-lang false --max-archives 1 --num-jobs 1 \ + --frames-per-eg-list $frames_per_eg_list \ + ${allocate_opts} --minibatch-size $minibatch_size \ --egs-prefix "train_diagnostic." \ $train_diagnostic_scp_list $megs_dir || exit 1; @@ -127,9 +147,10 @@ if [ $stage -le 1 ]; then echo "$0: combine valid_diagnostic.scp examples from all langs in $megs_dir/valid_diagnostic.scp." # Generate valid_diagnostic.scp for multilingual setup. $cmd $megs_dir/log/allocate_multilingual_examples_valid_diagnostic.log \ - steps/nnet3/multilingual/allocate_multilingual_examples.py \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ --random-lang false --max-archives 1 --num-jobs 1\ - --minibatch-size $minibatch_size \ + --frames-per-eg-list $frames_per_eg_list \ + ${allocate_opts} --minibatch-size $minibatch_size \ --egs-prefix "valid_diagnostic." \ $valid_diagnostic_scp_list $megs_dir || exit 1; @@ -139,6 +160,6 @@ for egs_type in combine train_diagnostic valid_diagnostic; do mv $megs_dir/${egs_type}.weight.1.ark $megs_dir/${egs_type}.weight.ark || exit 1; mv $megs_dir/${egs_type}.1.scp $megs_dir/${egs_type}.scp || exit 1; done -mv $megs_dir/info/egs.num_archives $megs_dir/info/num_archives || exit 1; -mv $megs_dir/info/egs.num_tasks $megs_dir/info/num_tasks || exit 1; +mv $megs_dir/info/${egs_prefix}num_archives $megs_dir/info/num_archives || exit 1; +mv $megs_dir/info/${egs_prefix}num_tasks $megs_dir/info/num_tasks || exit 1; echo "$0: Finished preparing multilingual training example." diff --git a/egs/wsj/s5/steps/subset_ali_dir.sh b/egs/wsj/s5/steps/subset_ali_dir.sh new file mode 100755 index 00000000000..a17a7fbf196 --- /dev/null +++ b/egs/wsj/s5/steps/subset_ali_dir.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +cmd=run.pl + +. path.sh + +. 
utils/parse_options.sh + +if [ $# -ne 4 ]; then + cat < + e.g.: data/train data/train_sp exp/tri3_ali_sp exp/tri3_ali +EOF +fi + +subset_data=$1 +data=$2 +ali_dir=$3 +dir=$4 + +nj=$(cat $ali_dir/num_jobs) || exit 1 +utils/split_data.sh $data $nj + +$cmd JOB=1:$nj $dir/log/copy_alignments.JOB.log \ + copy-int-vector "ark:gunzip -c $ali_dir/ali.JOB.gz |" \ + ark,scp:$dir/ali_tmp.JOB.ark,$dir/ali_tmp.JOB.scp || exit 1 + +for n in `seq $nj`; do + cat $dir/ali_tmp.$n.scp +done > $dir/ali_tmp.scp + +num_spk=$(cat $subset_data/spk2utt | wc -l) +if [ $num_spk -lt $nj ]; then + nj=$num_spk +fi + +utils/split_data.sh $subset_data $nj +$cmd JOB=1:$nj $dir/log/filter_alignments.JOB.log \ + copy-int-vector \ + "scp:utils/filter_scp.pl $subset_data/split${nj}/JOB/utt2spk $dir/ali_tmp.scp |" \ + "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 + +rm $dir/ali_tmp.*.{ark,scp} $dir/ali_tmp.scp + +exit 0 diff --git a/egs/wsj/s5/utils/queue.pl b/egs/wsj/s5/utils/queue.pl index 10fd3b1a885..88a2216e3c2 100755 --- a/egs/wsj/s5/utils/queue.pl +++ b/egs/wsj/s5/utils/queue.pl @@ -209,6 +209,8 @@ sub caught_signal { my $qsub_cmd = ""; my $read_command = 0; +my $check_gpu_job = "false"; + while() { chomp; my $line = $_; @@ -237,6 +239,9 @@ sub caught_signal { my $option = $1; # gpu my $value = $2; # 0 my $arg = $3; # -q all.q + if ($option eq "gpu" && $value == 1) { + $check_gpu_job = "true"; + } if (exists $cli_options{$option}) { $cli_default_options{($option,$value)} = $arg; } @@ -381,6 +386,9 @@ sub caught_signal { print Q "time2=\`date +\"%s\"\`\n"; print Q "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=$num_threads >>$logfile\n"; print Q "echo '#' Finished at \`date\` with status \$ret >>$logfile\n"; +print Q "if [ \$ret -ne 0 && $check_gpu_job ]; then\n"; +print Q " ps aux\n"; +print Q "fi\n"; print Q "[ \$ret -eq 137 ] && exit 100;\n"; # If process was killed (e.g. oom) it will exit with status 137; # let the script return with status 100 which will put it to E state; more easily rerunnable. 
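For clarity, the core of steps/subset_ali_dir.sh above is: concatenate the per-job ali_tmp.*.scp files, cap the number of jobs at the number of speakers in the subset, and keep only the subset's utterances (what filter_scp.pl does). A small Python sketch of that filtering step, using toy in-memory lists instead of the actual scp/utt2spk files:

def subset_alignments(ali_scp_lines, utt2spk_lines, requested_nj):
    # utterances and speakers present in the subset data dir
    subset_utts = set(line.split()[0] for line in utt2spk_lines)
    speakers = set(line.split()[1] for line in utt2spk_lines)
    # mirror "if [ $num_spk -lt $nj ]; then nj=$num_spk; fi"
    nj = min(requested_nj, len(speakers))
    # mirror filter_scp.pl: keep scp entries whose key is in the subset
    kept = [line for line in ali_scp_lines if line.split()[0] in subset_utts]
    return nj, kept

# nj, kept = subset_alignments(["utt1 ali.1.ark:12", "utt2 ali.1.ark:99"],
#                              ["utt1 spk1"], requested_nj=4)
# -> nj == 1, kept == ["utt1 ali.1.ark:12"]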
if ($array_job == 0) { # not an array job diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 35489ca5e22..fb0f0284df7 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -178,8 +178,9 @@ bool PhoneLatticeToProtoSupervisionInternal(const SupervisionOptions &opts, } proto_supervision->fst.AddArc(state, fst::StdArc(phone, phone, - fst::TropicalWeight(lat_arc.weight.Weight().Value1() - * opts.lm_scale + opts.phone_ins_penalty), + fst::TropicalWeight( + lat_arc.weight.Weight().Value1() + * opts.lm_scale + opts.phone_ins_penalty), lat_arc.nextstate)); int32 t_begin = std::max(0, (state_time - opts.left_tolerance)), t_end = std::min(num_frames, @@ -192,7 +193,7 @@ bool PhoneLatticeToProtoSupervisionInternal(const SupervisionOptions &opts, } if (lat.Final(state) != CompactLatticeWeight::Zero()) { proto_supervision->fst.SetFinal(state, fst::TropicalWeight( - lat.Final(state).Weight().Value1() * opts.lm_scale)); + lat.Final(state).Weight().Value1() * opts.lm_scale)); if (state_times[state] != num_frames) { KALDI_WARN << "Time of final state " << state << " in lattice is " << "not equal to number of frames " << num_frames diff --git a/src/chain/language-model.cc b/src/chain/language-model.cc index 41e06116ea8..d2bb073d764 100644 --- a/src/chain/language-model.cc +++ b/src/chain/language-model.cc @@ -26,7 +26,8 @@ namespace kaldi { namespace chain { -void LanguageModelEstimator::AddCounts(const std::vector &sentence) { +void LanguageModelEstimator::AddCounts(const std::vector &sentence, + int32 weight) { KALDI_ASSERT(opts_.ngram_order >= 2 && "--ngram-order must be >= 2"); KALDI_ASSERT(opts_.ngram_order >= opts_.no_prune_ngram_order); int32 order = opts_.ngram_order; @@ -36,23 +37,23 @@ void LanguageModelEstimator::AddCounts(const std::vector &sentence) { end = sentence.end(); for (; iter != end; ++iter) { KALDI_ASSERT(*iter != 0); - IncrementCount(history, *iter); + IncrementCount(history, *iter, weight); history.push_back(*iter); if (history.size() >= order) history.erase(history.begin()); } // Probability of end of sentence. This will end up getting ignored later, but // it still makes a difference for probability-normalization reasons. - IncrementCount(history, 0); + IncrementCount(history, 0, weight); } void LanguageModelEstimator::IncrementCount(const std::vector &history, - int32 next_phone) { + int32 next_phone, int32 weight) { int32 lm_state_index = FindOrCreateLmStateIndexForHistory(history); if (lm_states_[lm_state_index].tot_count == 0) { num_active_lm_states_++; } - lm_states_[lm_state_index].AddCount(next_phone, 1); + lm_states_[lm_state_index].AddCount(next_phone, weight); } void LanguageModelEstimator::SetParentCounts() { diff --git a/src/chain/language-model.h b/src/chain/language-model.h index b2c3f4cd746..123d5ab830f 100644 --- a/src/chain/language-model.h +++ b/src/chain/language-model.h @@ -91,7 +91,7 @@ class LanguageModelEstimator { // Adds counts for this sentence. Basically does: for each n-gram in the // sentence, count[n-gram] += 1. The only constraint on 'sentence' is that it // should contain no zeros. - void AddCounts(const std::vector &sentence); + void AddCounts(const std::vector &sentence, int32 weight); // Estimates the LM and outputs it as an FST. Note: there is // no concept here of backoff arcs. @@ -188,7 +188,7 @@ class LanguageModelEstimator { // adds the counts for this ngram (called from AddCounts()). 
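The weighted-count change above (AddCounts/IncrementCount taking an integer weight) reduces to accumulating each n-gram with a source-dependent weight instead of 1. A standalone Python sketch of that counting loop, independent of the Kaldi LanguageModelEstimator; the weights in the example calls are made up:

from collections import defaultdict

def add_counts(counts, sentence, order, weight):
    # counts maps (history tuple, next_phone) -> accumulated weight;
    # the history is kept at most order-1 phones long, and the
    # end-of-sentence event (phone 0) is counted with the same weight
    history = []
    for phone in sentence:
        assert phone != 0
        counts[(tuple(history), phone)] += weight
        history.append(phone)
        if len(history) >= order:
            history.pop(0)
    counts[(tuple(history), 0)] += weight

counts = defaultdict(int)
add_counts(counts, [3, 5, 5, 7], order=3, weight=10)  # e.g. sequences from one source
add_counts(counts, [3, 7], order=3, weight=1)         # e.g. sequences from another source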
inline void IncrementCount(const std::vector &history, - int32 next_phone); + int32 next_phone, int32 weight); // Computes whether backoff should be allowed for this lm_state. (the caller diff --git a/src/chainbin/chain-est-phone-lm.cc b/src/chainbin/chain-est-phone-lm.cc index f16b3f4f14b..db16cc4d51a 100644 --- a/src/chainbin/chain-est-phone-lm.cc +++ b/src/chainbin/chain-est-phone-lm.cc @@ -39,31 +39,52 @@ int main(int argc, char *argv[]) { " chain-est-phone-lm --leftmost-context-questions=dir/leftmost_questions.txt ark:- dir/phone_G.fst\n"; bool binary_write = true; + std::string scales_str; + LanguageModelOptions lm_opts; ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("scales", &scales_str, "Comma-separated list of scales " + "for the different sources of phone sequences"); lm_opts.Register(&po); po.Read(argc, argv); - if (po.NumArgs() != 2) { + if (po.NumArgs() < 2) { po.PrintUsage(); exit(1); } - std::string phone_seqs_rspecifier = po.GetArg(1), - lm_fst_wxfilename = po.GetArg(2); - + int32 num_sources = po.NumArgs() - 1; + + std::string lm_fst_wxfilename = po.GetArg(po.NumArgs()); + + std::vector scales(num_sources, 1); + if (!scales_str.empty()) { + std::vector parts; + SplitStringToVector(scales_str, ":,", false, &parts); + if (parts.size() != num_sources) { + KALDI_ERR << "--scales must have exactly num-sources = " + << num_sources << " scales."; + } + for (size_t i = 0; i < parts.size(); i++) { + scales[i] = std::atoi(parts[i].c_str()); + } + } LanguageModelEstimator lm_estimator(lm_opts); - SequentialInt32VectorReader phones_reader(phone_seqs_rspecifier); - KALDI_LOG << "Reading phone sequences"; - for (; !phones_reader.Done(); phones_reader.Next()) { - const std::vector &phone_seq = phones_reader.Value(); - lm_estimator.AddCounts(phone_seq); + for (int32 n = 1; n <= num_sources; n++) { + std::string phone_seqs_rspecifier = po.GetArg(n); + SequentialInt32VectorReader phones_reader(phone_seqs_rspecifier); + KALDI_LOG << "Reading phone sequences"; + for (; !phones_reader.Done(); phones_reader.Next()) { + const std::vector &phone_seq = phones_reader.Value(); + lm_estimator.AddCounts(phone_seq, scales[n-1]); + } } + KALDI_LOG << "Estimating phone LM"; fst::StdVectorFst fst; lm_estimator.Estimate(&fst); diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 4f26e145ac5..c6f643bcae7 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -25,6 +25,40 @@ namespace kaldi { namespace nnet3 { +// rename name of NnetIo with old_name to new_name. +void RenameIoNames(const std::string &old_name, + const std::string &new_name, + NnetChainExample *eg_modified) { + // list of io-names in eg_modified. + std::vector orig_output_names; + int32 output_size = eg_modified->outputs.size(); + for (int32 output_ind = 0; output_ind < output_size; output_ind++) + orig_output_names.push_back(eg_modified->outputs[output_ind].name); + + // find the io in eg with name 'old_name'. + int32 rename_output_ind = + std::find(orig_output_names.begin(), orig_output_names.end(), old_name) - + orig_output_names.begin(); + + if (rename_output_ind >= output_size) + KALDI_ERR << "No io-node with name " << old_name + << "exists in eg."; + eg_modified->outputs[rename_output_ind].name = new_name; +} + +// ranames NnetIo name with name 'output' to new_output_name +// and scales the supervision for 'output' using weight. 
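The --scales option added to chain-est-phone-lm expects one integer scale per phone-sequence source, split on ':' or ','. A short Python sketch of the same parsing and validation; it is not the binary's code, just the equivalent check:

import re

def parse_scales(scales_str, num_sources):
    # default: every source gets scale 1, as in the C++ above
    if not scales_str:
        return [1] * num_sources
    parts = [p for p in re.split(r"[:,]", scales_str) if p != ""]
    if len(parts) != num_sources:
        raise ValueError("--scales must have exactly num-sources = %d scales."
                         % num_sources)
    return [int(p) for p in parts]

# parse_scales("1,10", 2) -> [1, 10]
# parse_scales("", 3)     -> [1, 1, 1]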
+void SetWeightAndRenameOutput(BaseFloat weight, + const std::string &new_output_name, + NnetChainExample *eg) { + // scale the supervision weight for egs + for (int32 i = 0; i < eg->outputs.size(); i++) + if (eg->outputs[i].name == "output") + if (weight != 0.0 && weight != 1.0) + eg->outputs[i].supervision.weight *= weight; + // rename output io name to 'new_output_name'. + RenameIoNames("output", new_output_name, eg); +} // returns an integer randomly drawn with expected value "expected_count" // (will be either floor(expected_count) or ceil(expected_count)). @@ -268,6 +302,8 @@ int main(int argc, char *argv[]) { int32 frame_subsampling_factor = -1; BaseFloat keep_proportion = 1.0; int32 left_context = -1, right_context = -1; + std::string eg_weight_rspecifier, eg_output_rspecifier; + ParseOptions po(usage); po.Register("random", &random, "If true, will write frames to output " "archives randomly, not round-robin."); @@ -285,6 +321,15 @@ int main(int argc, char *argv[]) { "feature left-context that we output."); po.Register("right-context", &right_context, "Can be used to truncate the " "feature right-context that we output."); + po.Register("weights", &eg_weight_rspecifier, + "Rspecifier indexed by the key of egs, providing a weight by " + "which we will scale the supervision matrix for that eg. " + "Used in multilingual training."); + po.Register("outputs", &eg_output_rspecifier, + "Rspecifier indexed by the key of egs, providing a string-valued " + "output name, e.g. 'output-0'. If provided, the NnetIo with " + "name 'output' will be renamed to the provided name. Used in " + "multilingual training."); po.Read(argc, argv); srand(srand_seed); @@ -297,6 +342,8 @@ int main(int argc, char *argv[]) { std::string examples_rspecifier = po.GetArg(1); SequentialNnetChainExampleReader example_reader(examples_rspecifier); + RandomAccessTokenReader output_reader(eg_output_rspecifier); + RandomAccessBaseFloatReader egs_weight_reader(eg_weight_rspecifier); int32 num_outputs = po.NumArgs() - 1; std::vector example_writers(num_outputs); @@ -307,8 +354,9 @@ int main(int argc, char *argv[]) { // not configurable for now. exclude_names.push_back(std::string("ivector")); - int64 num_read = 0, num_written = 0; - + int64 num_read = 0, num_written = 0, num_err = 0; + bool modify_eg_output = !(eg_output_rspecifier.empty() && + eg_weight_rspecifier.empty()); for (; !example_reader.Done(); example_reader.Next(), num_read++) { if (frame_subsampling_factor == -1) CalculateFrameSubsamplingFactor(example_reader.Value(), @@ -316,11 +364,41 @@ int main(int argc, char *argv[]) { // count is normally 1; could be 0, or possibly >1. int32 count = GetCount(keep_proportion); std::string key = example_reader.Key(); + NnetChainExample eg_modified_output; + const NnetChainExample &eg_orig = example_reader.Value(), + &eg = (modify_eg_output ? eg_modified_output : eg_orig); + // Note: in the normal case we just use 'eg'; eg_modified_output is + // for the case when the --outputs or --weights option is specified + // (only for multilingual training). + BaseFloat weight = 1.0; + std::string new_output_name; + if (modify_eg_output) { // This branch is only taken for multilingual training. 
+ eg_modified_output = eg_orig; + if (!eg_weight_rspecifier.empty()) { + if (!egs_weight_reader.HasKey(key)) { + KALDI_WARN << "No weight for example key " << key; + num_err++; + continue; + } + weight = egs_weight_reader.Value(key); + } + if (!eg_output_rspecifier.empty()) { + if (!output_reader.HasKey(key)) { + KALDI_WARN << "No new output-name for example key " << key; + num_err++; + continue; + } + new_output_name = output_reader.Value(key); + } + } if (frame_shift == 0 && left_context == -1 && right_context == -1) { - const NnetChainExample &eg = example_reader.Value(); for (int32 c = 0; c < count; c++) { int32 index = (random ? Rand() : num_written) % num_outputs; + if (modify_eg_output) // Only for multilingual training + SetWeightAndRenameOutput(weight, new_output_name, + &eg_modified_output); + example_writers[index]->Write(key, eg); num_written++; } @@ -336,6 +414,8 @@ int main(int argc, char *argv[]) { eg_out.Swap(&eg); for (int32 c = 0; c < count; c++) { int32 index = (random ? Rand() : num_written) % num_outputs; + if (modify_eg_output) + SetWeightAndRenameOutput(weight, new_output_name, &eg_out); example_writers[index]->Write(key, eg_out); num_written++; } diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index c8c251900ec..b644ba0aa01 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -43,6 +43,8 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, const MatrixBase *ivector_feats, int32 ivector_period, const chain::Supervision &supervision, + const VectorBase *deriv_weights, + int32 supervision_length_tolerance, const std::string &utt_id, bool compress, UtteranceSplitter *utt_splitter, @@ -51,7 +53,18 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, int32 num_input_frames = feats.NumRows(), num_output_frames = supervision.frames_per_sequence; - if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, num_output_frames)) + int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; + + if (deriv_weights && (std::abs(deriv_weights->Dim() - num_output_frames) + > supervision_length_tolerance)) { + KALDI_WARN << "For utterance " << utt_id + << ", mismatch between deriv-weights dim and num-output-frames" + << "; " << deriv_weights->Dim() << " vs " << num_output_frames; + return false; + } + + if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, num_output_frames, + supervision_length_tolerance)) return false; // LengthsMatch() will have printed a warning. std::vector chunks; @@ -65,8 +78,6 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, return false; } - int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; - chain::SupervisionSplitter sup_splitter(supervision); for (size_t c = 0; c < chunks.size(); c++) { @@ -92,19 +103,36 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, int32 first_frame = 0; // we shift the time-indexes of all these parts so // that the supervised part starts from frame 0. 
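Taken together, the --weights/--outputs handling in nnet3-chain-copy-egs is: look up an optional per-key supervision weight and output name, scale the supervision weight of the NnetIo named 'output', and rename it. A schematic Python version using plain dicts in place of the RandomAccess readers and of NnetChainExample; the field names here are illustrative, not the real C++ structures:

def modify_eg(eg, key, weight_table=None, output_table=None):
    # eg: dict with an 'outputs' list of {'name': ..., 'weight': ...} entries
    # weight_table / output_table: dicts indexed by the eg key;
    # returning None corresponds to the "no weight / no output-name" warnings
    weight, new_name = 1.0, None
    if weight_table is not None:
        if key not in weight_table:
            return None
        weight = weight_table[key]
    if output_table is not None:
        if key not in output_table:
            return None
        new_name = output_table[key]
    for out in eg['outputs']:
        if out['name'] == 'output':
            if weight not in (0.0, 1.0):
                out['weight'] *= weight   # scale the supervision weight
            if new_name is not None:
                out['name'] = new_name    # e.g. rename to 'output-2'
    return eg

# eg = {'outputs': [{'name': 'output', 'weight': 1.0}]}
# modify_eg(eg, 'utt1', weight_table={'utt1': 0.5}, output_table={'utt1': 'output-2'})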
+ + NnetChainExample nnet_chain_eg; + nnet_chain_eg.outputs.resize(1); SubVector output_weights( &(chunk.output_weights[0]), static_cast(chunk.output_weights.size())); - NnetChainSupervision nnet_supervision("output", supervision_part, - output_weights, - first_frame, - frame_subsampling_factor); + if (!deriv_weights) { + NnetChainSupervision nnet_supervision("output", supervision_part, + output_weights, + first_frame, + frame_subsampling_factor); + nnet_chain_eg.outputs[0].Swap(&nnet_supervision); + } else { + Vector this_deriv_weights(num_frames_subsampled); + for (int32 i = 0; i < num_frames_subsampled; i++) { + int32 t = i + start_frame_subsampled; + if (t < deriv_weights->Dim()) + this_deriv_weights(i) = (*deriv_weights)(t); + } + KALDI_ASSERT(output_weights.Dim() == num_frames_subsampled); + this_deriv_weights.MulElements(output_weights); + NnetChainSupervision nnet_supervision("output", supervision_part, + this_deriv_weights, + first_frame, + frame_subsampling_factor); + nnet_chain_eg.outputs[0].Swap(&nnet_supervision); + } - NnetChainExample nnet_chain_eg; - nnet_chain_eg.outputs.resize(1); - nnet_chain_eg.outputs[0].Swap(&nnet_supervision); nnet_chain_eg.inputs.resize(ivector_feats != NULL ? 2 : 1); int32 tot_input_frames = chunk.left_context + chunk.num_frames + @@ -176,13 +204,15 @@ int main(int argc, char *argv[]) { "chain-get-supervision.\n"; bool compress = true; - int32 length_tolerance = 100, online_ivector_period = 1; + int32 length_tolerance = 100, online_ivector_period = 1, + supervision_length_tolerance = 1; ExampleGenerationConfig eg_config; // controls num-frames, // left/right-context, etc. + BaseFloat scale = 1.0; int32 srand_seed = 0; - std::string online_ivector_rspecifier; + std::string online_ivector_rspecifier, deriv_weights_rspecifier; ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs with input features " @@ -200,6 +230,16 @@ int main(int argc, char *argv[]) { po.Register("srand", &srand_seed, "Seed for random number generator "); po.Register("length-tolerance", &length_tolerance, "Tolerance for " "difference in num-frames between feat and ivector matrices"); + po.Register("supervision-length-tolerance", &supervision_length_tolerance, "Tolerance for " + "difference in num-frames-subsampled between supervision and deriv weights"); + po.Register("deriv-weights-rspecifier", &deriv_weights_rspecifier, + "Per-frame weights (only binary - 0 or 1) that specifies " + "whether a frame's gradient must be backpropagated or not. 
" + "Not specifying this is equivalent to specifying a vector of " + "all 1s."); + po.Register("normalization-scale", &scale, "Scale the weights from the " + "'normalization' FST before applying them to the examples."); + eg_config.Register(&po); po.Read(argc, argv); @@ -235,6 +275,14 @@ int main(int argc, char *argv[]) { if (!normalization_fst_rxfilename.empty()) { ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); KALDI_ASSERT(normalization_fst.NumStates() > 0); + + if (scale <= 0.0) { + KALDI_ERR << "Invalid scale on normalization FST; must be > 0.0"; + } + + if (scale != 1.0) { + ScaleFst(scale, &normalization_fst); + } } // Read as GeneralMatrix so we don't need to un-compress and re-compress @@ -245,6 +293,8 @@ int main(int argc, char *argv[]) { NnetChainExampleWriter example_writer(examples_wspecifier); RandomAccessBaseFloatMatrixReader online_ivector_reader( online_ivector_rspecifier); + RandomAccessBaseFloatVectorReader deriv_weights_reader( + deriv_weights_rspecifier); int32 num_err = 0; @@ -278,10 +328,24 @@ int main(int argc, char *argv[]) { num_err++; continue; } + + const Vector *deriv_weights = NULL; + if (!deriv_weights_rspecifier.empty()) { + if (!deriv_weights_reader.HasKey(key)) { + KALDI_WARN << "No deriv weights for utterance " << key; + num_err++; + continue; + } else { + // this address will be valid until we call HasKey() or Value() + // again. + deriv_weights = &(deriv_weights_reader.Value(key)); + } + } if (!ProcessFile(normalization_fst, feats, online_ivector_feats, online_ivector_period, - supervision, key, compress, + supervision, deriv_weights, supervision_length_tolerance, + key, compress, &utt_splitter, &example_writer)) num_err++; } diff --git a/src/chainbin/nnet3-chain-normalize-egs.cc b/src/chainbin/nnet3-chain-normalize-egs.cc index 9d3f56f756a..139c08e7799 100644 --- a/src/chainbin/nnet3-chain-normalize-egs.cc +++ b/src/chainbin/nnet3-chain-normalize-egs.cc @@ -41,7 +41,11 @@ int main(int argc, char *argv[]) { "e.g.\n" "nnet3-chain-normalize-egs dir/normalization.fst ark:train_in.cegs ark:train_out.cegs\n"; + BaseFloat scale = 1.0; + ParseOptions po(usage); + po.Register("normalization-scale", &scale, "Scale the weights from the " + "'normalization' FST before applying them to the examples."); po.Read(argc, argv); @@ -57,6 +61,14 @@ int main(int argc, char *argv[]) { fst::StdVectorFst normalization_fst; ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); + if (scale <= 0.0) { + KALDI_ERR << "Invalid scale on normalization FST; must be > 0.0"; + } + + if (scale != 1.0) { + ScaleFst(scale, &normalization_fst); + } + SequentialNnetChainExampleReader example_reader(examples_rspecifier); NnetChainExampleWriter example_writer(examples_wspecifier); diff --git a/src/nnet3/nnet-chain-combine.cc b/src/nnet3/nnet-chain-combine.cc index c93858fb06e..67de2b843bb 100644 --- a/src/nnet3/nnet-chain-combine.cc +++ b/src/nnet3/nnet-chain-combine.cc @@ -503,18 +503,18 @@ double NnetChainCombiner::ComputeObjfAndDerivFromNnet( prob_computer_->Reset(); std::vector::const_iterator iter = egs_.begin(), end = egs_.end(); - for (; iter != end; ++iter) + for (; iter != end; ++iter) { prob_computer_->Compute(*iter); - const ChainObjectiveInfo *objf_info = - prob_computer_->GetObjective("output"); - if (objf_info == NULL) - KALDI_ERR << "Error getting objective info (unsuitable egs?)"; - KALDI_ASSERT(objf_info->tot_weight > 0.0); + } + + std::pair p = prob_computer_->GetTotalObjective(); + BaseFloat tot_objf = p.first, tot_weight = p.second; + 
KALDI_ASSERT(tot_weight > 0.0); const Nnet &deriv = prob_computer_->GetDeriv(); VectorizeNnet(deriv, nnet_params_deriv); // we prefer to deal with normalized objective functions. - nnet_params_deriv->Scale(1.0 / objf_info->tot_weight); - return (objf_info->tot_like + objf_info->tot_l2_term) / objf_info->tot_weight; + nnet_params_deriv->Scale(1.0 / tot_weight); + return tot_objf / tot_weight; } diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 084b33347df..cd3d5894601 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -207,6 +207,26 @@ bool NnetChainComputeProb::PrintTotalStats() const { } +std::pair NnetChainComputeProb::GetTotalObjective() const { + unordered_map::const_iterator + iter, end; + iter = objf_info_.begin(); + end = objf_info_.end(); + BaseFloat tot_objf = 0.0, tot_weight = 0.0; + for (; iter != end; ++iter) { + const std::string &name = iter->first; + int32 node_index = nnet_.GetNodeIndex(name); + KALDI_ASSERT(node_index >= 0); + const ChainObjectiveInfo &info = iter->second; + BaseFloat like = (info.tot_like / info.tot_weight), + l2_term = (info.tot_l2_term / info.tot_weight); + tot_objf += like + l2_term; + tot_weight += info.tot_weight; + } + return std::make_pair(tot_objf, tot_weight); +} + + const ChainObjectiveInfo* NnetChainComputeProb::GetObjective( const std::string &output_name) const { unordered_map::const_iterator @@ -217,15 +237,29 @@ const ChainObjectiveInfo* NnetChainComputeProb::GetObjective( return NULL; } +static bool HasXentOutputs(const Nnet &nnet) { + const std::vector node_names = nnet.GetNodeNames(); + for (std::vector::const_iterator it = node_names.begin(); + it != node_names.end(); ++it) { + int32 node_index = nnet.GetNodeIndex(*it); + if (nnet.IsOutputNode(node_index) && + it->find("-xent") != std::string::npos) { + return true; + } + } + return false; +} + void RecomputeStats(const std::vector &egs, const chain::ChainTrainingOptions &chain_config_in, const fst::StdVectorFst &den_fst, Nnet *nnet) { KALDI_LOG << "Recomputing stats on nnet (affects batch-norm)"; chain::ChainTrainingOptions chain_config(chain_config_in); - if (nnet->GetNodeIndex("output-xent") != -1 && + if (HasXentOutputs(*nnet) && chain_config.xent_regularize == 0) { - // this forces it to compute the output for 'output-xent', which + // this forces it to compute the output for xent outputs, + // usually 'output-xent', which // means that we'll be computing batch-norm stats for any // components in that branch that have batch-norm. chain_config.xent_regularize = 0.1; diff --git a/src/nnet3/nnet-chain-diagnostics.h b/src/nnet3/nnet-chain-diagnostics.h index 4125427c463..b2962cf87d3 100644 --- a/src/nnet3/nnet-chain-diagnostics.h +++ b/src/nnet3/nnet-chain-diagnostics.h @@ -83,6 +83,9 @@ class NnetChainComputeProb { // or NULL if there is no such info. const ChainObjectiveInfo *GetObjective(const std::string &output_name) const; + // returns the total objective summed over all the outputs + std::pair GetTotalObjective() const; + // if config.compute_deriv == true, returns a reference to the // computed derivative. Otherwise crashes. 
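GetTotalObjective() above only aggregates per-output statistics: each output contributes its normalized likelihood plus l2 term to the total objective, and its weight to the total weight. The same arithmetic as a small Python sketch over a per-output stats dict (the numbers in the usage comment are invented):

def get_total_objective(objf_info):
    # objf_info: {output_name: (tot_like, tot_l2_term, tot_weight)}
    tot_objf, tot_weight = 0.0, 0.0
    for tot_like, tot_l2_term, weight in objf_info.values():
        tot_objf += tot_like / weight + tot_l2_term / weight
        tot_weight += weight
    return tot_objf, tot_weight

# get_total_objective({'output-0': (-120.0, -1.0, 50.0),
#                      'output-1': (-300.0, -2.5, 100.0)})
# -> (-5.445, 150.0)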
const Nnet &GetDeriv() const; diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index 351312fb952..d40df1a79f9 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -31,8 +31,8 @@ void NnetChainSupervision::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, name); WriteIndexVector(os, binary, indexes); supervision.Write(os, binary); - WriteToken(os, binary, ""); // for DerivWeights. Want to save space. - WriteVectorAsChar(os, binary, deriv_weights); + WriteToken(os, binary, ""); // for DerivWeights. Want to save space. + deriv_weights.Write(os, binary); WriteToken(os, binary, ""); } @@ -51,8 +51,11 @@ void NnetChainSupervision::Read(std::istream &is, bool binary) { ReadToken(is, binary, &token); // in the future this back-compatibility code can be reworked. if (token != "") { - KALDI_ASSERT(token == ""); - ReadVectorAsChar(is, binary, &deriv_weights); + KALDI_ASSERT(token == "" || token == ""); + if (token == "") + ReadVectorAsChar(is, binary, &deriv_weights); + else + deriv_weights.Read(is, binary); ExpectToken(is, binary, ""); } CheckDim(); @@ -82,8 +85,7 @@ void NnetChainSupervision::CheckDim() const { } if (deriv_weights.Dim() != 0) { KALDI_ASSERT(deriv_weights.Dim() == indexes.size()); - KALDI_ASSERT(deriv_weights.Min() >= 0.0 && - deriv_weights.Max() <= 1.0); + KALDI_ASSERT(deriv_weights.Min() >= 0.0); } } diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 0d4ab0233c2..9b6cca02092 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -534,10 +534,12 @@ void UtteranceSplitter::InitSplitForLength() { bool UtteranceSplitter::LengthsMatch(const std::string &utt, int32 utterance_length, - int32 supervision_length) const { + int32 supervision_length, + int32 length_tolerance) const { int32 sf = config_.frame_subsampling_factor, expected_supervision_length = (utterance_length + sf - 1) / sf; - if (supervision_length == expected_supervision_length) { + if (std::abs(supervision_length - expected_supervision_length) + <= length_tolerance) { return true; } else { if (sf == 1) { @@ -1251,6 +1253,21 @@ void ExampleMerger::Finish() { stats_.PrintStats(); } +void ScaleFst(BaseFloat scale, fst::StdVectorFst *fst) { + typedef fst::StdArc Arc; + typedef Arc::StateId StateId; + typedef Arc::Weight Weight; + + for (StateId s = 0; s < fst->NumStates(); s++) { + for (fst::MutableArcIterator aiter(fst, s); + !aiter.Done(); aiter.Next()) { + Arc arc = aiter.Value(); + Weight weight(arc.weight.Value() * scale); + arc.weight = weight; + aiter.SetValue(arc); + } + } +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index ce7ebd1dd2a..3dcd90eb980 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -190,7 +190,8 @@ class UtteranceSplitter { // supervision_length = (utterance_length + sf - 1) / sf. 
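The new ScaleFst() helper just multiplies every arc weight (a cost in the tropical semiring) by a scalar, leaving final weights untouched. Stripped of the OpenFst types, the operation looks like this; the adjacency-list layout is a toy stand-in, not the real StdVectorFst:

def scale_fst(scale, fst):
    # fst: list indexed by state; each state holds a list of
    # [ilabel, olabel, weight, nextstate] arcs
    for arcs in fst:
        for arc in arcs:
            arc[2] *= scale   # same as Weight(arc.weight.Value() * scale)

toy_fst = [[[1, 1, 0.5, 1]], [[2, 2, 1.2, 0]]]
scale_fst(2.0, toy_fst)
# arc weights are now 1.0 and 2.4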
bool LengthsMatch(const std::string &utt, int32 utterance_length, - int32 supervision_length) const; + int32 supervision_length, + int32 length_tolerance = 0) const; ~UtteranceSplitter(); @@ -515,7 +516,7 @@ class ExampleMerger { MapType eg_to_egs_; }; - +void ScaleFst(BaseFloat scale, fst::StdVectorFst *fst); } // namespace nnet3 } // namespace kaldi From 0a6b8246490c118b83342c68ed5a185b38483f22 Mon Sep 17 00:00:00 2001 From: Pegita Date: Sat, 5 Aug 2017 21:55:40 -0400 Subject: [PATCH 033/174] added new Xconfig layer to parse existing model and modified run_tdnn_wsj_rm_1a.sh to accept input-model. --- .../local/chain/tuning/run_tdnn_wsj_rm_1a.sh | 39 +++++------ .../local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 43 ++++++++---- .../local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 29 +++++--- .../nnet3/train/chain_objf/acoustic_model.py | 14 ++-- egs/wsj/s5/steps/libs/nnet3/train/common.py | 38 ++--------- .../steps/libs/nnet3/xconfig/basic_layers.py | 66 ++++++++++++++++++- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 2 +- egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py | 4 +- egs/wsj/s5/steps/nnet3/chain/train.py | 11 +++- egs/wsj/s5/steps/nnet3/chain/train_more.py | 5 ++ 10 files changed, 165 insertions(+), 86 deletions(-) diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh index 6c224e35458..2f93c078a38 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -31,7 +31,8 @@ xent_regularize=0.1 # configs for transfer learning srcdir=../../wsj/s5/ -common_egs_dir=exp/chain/tdnn_wsj_rm_1c_fixed_ac_scale/egs +common_egs_dir= +#common_egs_dir=exp/chain/tdnn_wsj_rm_1c_fixed_ac_scale/egs src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl primary_lr_factor=0.25 dim=450 @@ -52,6 +53,14 @@ where "nvcc" is installed. EOF fi +required_files="$src_mdl $srcdir/exp/nnet3/extractor/final.mdl" + +for f in $required_files; do + if [ ! -f $f ]; then + echo "$0: no such file $f" + fi +done + # The iVector-extraction and feature-dumping parts are the same as the standard # nnet3 setup, and you can skip them by setting "--stage 8" if you have already # run those things. @@ -105,25 +114,19 @@ if [ $stage -le 7 ]; then cat < $dir/configs/network.xconfig relu-renorm-layer name=tdnn7-target input=Append(tdnn6.renorm@-3,tdnn6.renorm@0) dim=$dim ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain-target input=tdnn7-target dim=$dim target-rms=0.5 - output-layer name=output-target include-log-softmax=false dim=$num_targets max-change=1.5 - relu-renorm-layer name=prefinal-xent-target input=tdnn7-target dim=$dim target-rms=0.5 - output-layer name=output-xent-target dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 -EOF - # edits.config contains edits required to train transferred model. - # e.g. substitute output-node of previous model with new output - # and removing orphan nodes and components. 
- cat < $dir/configs/edits.config - remove-output-nodes name=output - remove-output-nodes name=output-xent - rename-node old-name=output-target new-name=output - rename-node old-name=output-xent-target new-name=output-xent - remove-orphans + relu-renorm-layer name=prefinal-chain input=tdnn7-target dim=$dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + relu-renorm-layer name=prefinal-xent input=tdnn7-target dim=$dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \ --xconfig-file $dir/configs/network.xconfig \ - --edits-config $dir/configs/edits.config \ --config-dir $dir/configs/ + + # Set the learning-rate-factor to be primary_lr_factor for initial network." + # and add new layer to initial model + nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" $src_mdl - | \ + nnet3-init --srand=1 - $dir/configs/final.config $dir/input.raw || exit 1; fi if [ $stage -le 8 ]; then @@ -132,12 +135,10 @@ if [ $stage -le 8 ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi - echo "$0: set the learning-rate-factor for initial network to be zero." - nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" \ - $src_mdl $dir/init.raw || exit 1; steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ + --trainer.input-model $dir/input.raw \ --feat.online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ --chain.xent-regularize $xent_regularize \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index e1e3cc35ada..1a4ce46ad39 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -1,12 +1,22 @@ #!/bin/bash -# _1b is as _1a but uses a src-tree-dir to generate new target alignment and lattices -# using source model. It also combines -# alignemts from source and target to train phone LM for den.fst in chain denominator graph. +# _1b is as _1a, but different as follows +# 1) uses src phone set phones.txt and new lexicon generated using word pronunciation +# in src lexincon.txt and target word not presented in src are added as oov +# in lexicon.txt. +# 2) It uses src tree-dir and generates new target alignment and lattices using +# src gmm model. +# 3) It also train phone LM using weighted combination of alignemts from source +# and target, which is used in chain denominator graph. +# Since we use phone.txt from source dataset, this can be helpful in cases +# where there is few training data in target and some 4-gram phone sequences +# have no count in target. +# 4) It does not replace the output layer from already-trained model with new +# randomely initialized output layer and and re-train it using target dataset. + # This script uses weight transfer as Transfer learning method -# and use already trained model on wsj and remove the last layer and -# add new randomly initialized layer and retrain the whole network. -# while training new added layer using rm data. +# and use already trained model on wsj and fine-tune the whole network using rm data +# while training the last layer with higher learning-rate. 
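The model-preparation step used by these scripts is a two-stage pipe: scale the learning-rate factors of the transferred layers with nnet3-copy --edits, then graft the new layers from configs/final.config on top with nnet3-init. A rough Python wrapper around exactly that pipeline, assuming the Kaldi binaries are on PATH; paths and the factor value are placeholders:

import subprocess

def make_input_model(src_mdl, final_config, out_raw, primary_lr_factor=0.25):
    # nnet3-copy --edits=... $src_mdl - | nnet3-init --srand=1 - final.config input.raw
    edits = ("set-learning-rate-factor name=* learning-rate-factor=%s"
             % primary_lr_factor)
    copy = subprocess.Popen(["nnet3-copy", "--edits=" + edits, src_mdl, "-"],
                            stdout=subprocess.PIPE)
    subprocess.check_call(["nnet3-init", "--srand=1", "-", final_config, out_raw],
                          stdin=copy.stdout)
    copy.stdout.close()
    if copy.wait() != 0:
        raise RuntimeError("nnet3-copy failed")

# make_input_model("../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl",
#                  "exp/chain/tdnn_wsj_rm_1a/configs/final.config",
#                  "exp/chain/tdnn_wsj_rm_1a/input.raw")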
# The chain config is as run_tdnn_5n.sh and the result is: # System tdnn_5n tdnn_wsj_rm_1a tdnn_wsj_rm_1b tdnn_wsj_rm_1c # WER 2.71 2.09 3.45 3.38 @@ -33,11 +43,10 @@ xent_regularize=0.1 # configs for transfer learning common_egs_dir= -#srcdir=../../wsj/s5/ -srcdir=/export/a09/pegahgh/kaldi-transfer-learning/egs/wsj/s5-sp +srcdir=../../wsj/s5/ src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl src_lang=$srcdir/data/lang -src_gmm_mdl=$srcdir/exp/tri4b +src_gmm_dir=$srcdir/exp/tri4b src_tree_dir=$srcdir/exp/chain/tree_a_sp # chain tree-dir for src data; # the alignment in target domain is # converted using src-tree @@ -73,10 +82,18 @@ ali_dir=exp/tri4b${src_tree_dir:+_wsj}_ali lat_dir=exp/tri3b_lats${src_tree_dir:+_wsj} dir=exp/chain/tdnn_wsj_rm${tdnn_affix} +required_files="$src_mdl $src_lang/lexicon.txt $src_gmm_dir/final.mdl $srd_tree_dir/tree" + +for f in $required_files; do + if [ ! -f $f ]; then + echo "$0: no such file $f" + fi +done + if [ $stage -le -1 ]; then echo "$0: prepare lexicon.txt for RM using WSJ lexicon." if ! cmp -s <(grep -v "^#" $src_lang/phones.txt) <(grep -v "^#" data/lang/phones.txt); then - local/prepare_wsj_rm_lang.sh $srcdir/data/local/dict_nosp $srcdir/data/lang $lang_dir + local/prepare_wsj_rm_lang.sh $srcdir/data/local/dict_nosp $srcdir/data/lang $lang_dir else rm -rf $lang_dir cp -r data/lang $lang_dir @@ -92,7 +109,7 @@ local/online/run_nnet2_common.sh --stage $stage \ if [ $stage -le 4 ]; then echo "$0: Generate alignment using source model." steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ - data/train $lang_dir $src_gmm_mdl $ali_dir || exit 1; + data/train $lang_dir $src_gmm_dir $ali_dir || exit 1; fi @@ -101,7 +118,7 @@ if [ $stage -le 5 ]; then # use the same num-jobs as the alignments nj=$(cat exp/tri3b_ali/num_jobs) || exit 1; steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/train \ - $lang_dir $src_gmm_mdl $lat_dir || exit 1; + $lang_dir $src_gmm_dir $lat_dir || exit 1; rm $lat_dir/fsts.*.gz # save space fi @@ -126,7 +143,7 @@ if [ $stage -le 7 ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi - echo "$0: set the learning-rate-factor for initial network to be zero." + # set the learning-rate-factor for initial network to be zero." $decode_cmd $dir/log/copy_mdl.log \ nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=$final_lr_factor" \ $src_mdl $dir/init.raw || exit 1; diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index a6e1616c25e..dbbf69da9ce 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -1,15 +1,28 @@ #!/bin/bash -# _1c is as _1b but it uses chain model to generate alignment for RM using SWJ model. -# _1b is as _1a but uses a src-tree-dir to convert target alignment and combine -# alignemts from source and target to train phone LM for den.fst in chain denominator graph. +# _1c is as _1b but it uses src chain model instead of GMM model to generate alignment for RM using SWJ model. + +# _1b is as _1a, but different as follows +# 1) uses src phone set phones.txt and new lexicon generated using word pronunciation +# in src lexincon.txt and target word not presented in src are added as oov +# in lexicon.txt. 
+# 2) It uses src tree-dir and generates new target alignment and lattices using +# src gmm model. +# 3) It also train phone LM using weighted combination of alignemts from source +# and target, which is used in chain denominator graph. +# Since we use phone.txt from source dataset, this can be helpful in cases +# where there is few training data in target and some 4-gram phone sequences +# have no count in target. +# 4) It does not replace the output layer from already-trained model with new +# randomely initialized output layer and and re-train it using target dataset. + # This script uses weight transfer as Transfer learning method -# and use already trained model on wsj and remove the last layer and -# add new randomly initialized layer and retrain the whole network. -# while training new added layer using rm data. +# and use already trained model on wsj and fine-tune the whole network using rm data +# while training the last layer with higher learning-rate. # The chain config is as run_tdnn_5n.sh and the result is: -#System tdnn_5n tdnn_wsj_rm -#WER 2.71 2.21 +# System tdnn_5n tdnn_wsj_rm_1a tdnn_wsj_rm_1b tdnn_wsj_rm_1c +# WER 2.71 2.09 3.45 3.38 + set -e # configs for 'chain' diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 69969ed0535..a5ce15c80b6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -459,13 +459,13 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, common_lib.force_symlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) -def prepare_initial_acoustic_model(dir, run_opts, srand=-1): +def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_mdl=None): """ Adds the first layer; this will also add in the lda.mat and presoftmax_prior_scale.vec. It will also prepare the acoustic model with the transition model.""" - - common_train_lib.prepare_initial_network(dir, run_opts, - srand=srand) + if input_mdl is None: + common_train_lib.prepare_initial_network(dir, run_opts, + srand=srand) # The model-format for a 'chain' acoustic model is just the transition # model and then the raw nnet, so we can use 'cat' to create this, as # long as they have the same mode (binary or not binary). @@ -474,8 +474,10 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1): # before concatenating them. common_lib.execute_command( """{command} {dir}/log/init_mdl.log \ - nnet3-am-init {dir}/0.trans_mdl {dir}/0.raw \ - {dir}/0.mdl""".format(command=run_opts.command, dir=dir)) + nnet3-am-init {dir}/0.trans_mdl {raw_mdl} \ + {dir}/0.mdl""".format(command=run_opts.command, dir=dir, + raw_mdl=(input_mdl if input_mdl is not None + else '{0}/0.raw'.format(dir)))) def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index a96f4efaa49..3f4bb7ccf88 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -502,43 +502,13 @@ def smooth_presoftmax_prior_scale_vector(pdf_counts, def prepare_initial_network(dir, run_opts, srand=-3): - """ This function prepares "0.raw" model by adding layers - in final.config. - If dir/init.raw exists, the layers in "final.config" added to "init.raw", - otherwise the new model is initialized using final.config. 
- If {dir}/configs/edit.config exists, the intermediate model "0.pre-edited.raw" - is generated by modifying "init.raw" by adding layers in "final.config" and - this model is eddited using edits.config as "0.raw". - edits.config applied in cases where the initial network "init.raw" - is copied from another setup (i.e. Weight transfer and we need to rename - output nodes) and there is no init.config in config dir to generate "init.raw". - An example of edits.config is: - i.e. remove-output-nodes name=output - rename-node old-name= new-name=output - """ - if os.path.exists("{0}/init.raw".format(dir)): - edits_config_file = "{0}/configs/edits.config".format(dir) + if os.path.exists(dir+"/configs/init.config"): common_lib.execute_command( """{command} {dir}/log/add_first_layer.log \ nnet3-init --srand={srand} {dir}/init.raw \ - {dir}/configs/final.config {dir}/0{edit_suffix}.raw""".format( + {dir}/configs/final.config {dir}/0.raw""".format( command=run_opts.command, srand=srand, - dir=dir, - edit_suffix = ('.pre-edited' if os.path.exists(edits_config_file) - else ''))) - assert(os.path.exists("{0}/configs/edits.config".format(dir)) or - os.path.exists(edits_config_file)) - if os.path.exists(edits_config_file): - logger.info("edits 0.raw model using {0}/configs/edits.config." - "".format(dir)) - common_lib.execute_command( - """{command} {dir}/log/edit.log \ - nnet3-copy --edits-config={edits_config} {dir}/0.pre-edited.raw \ - {dir}/0.raw - """.format(command=run_opts.command, - dir=dir, - edits_config=edits_config_file)) - + dir=dir)) else: common_lib.execute_command( """{command} {dir}/log/init_model.log \ @@ -886,7 +856,7 @@ def __init__(self, sequentially.""") self.parser.add_argument("--trainer.optimization.backstitch-training-scale", type=float, dest='backstitch_training_scale', - default=0.0, help="""scale of parameters changes + default=0.0, help="""scale of parameters changes used in backstitch training step.""") self.parser.add_argument("--trainer.optimization.backstitch-training-interval", type=int, dest='backstitch_training_interval', diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index a4f4c1b0323..19797a7f13c 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -42,7 +42,7 @@ def __init__(self, first_token, key_to_value, all_layers): raise RuntimeError("Invalid value: name={0}".format( key_to_value['name'])) for prev_layer in all_layers: - if self.name == prev_layer.name: + if self.name == prev_layer.name and prev_layer.layer_type is not 'auxiliary': raise RuntimeError("Name '{0}' is used for more than one " "layer.".format(self.name)) @@ -354,6 +354,16 @@ def get_full_config(self): + + +class XconfigTrivialOutputLayer(XconfigLayerBase): + """This class is for lines like + 'output name=output input=Append(input@-1, input@0, input@1, ReplaceIndex(ivector, t, 0))' + This is for outputs that are not really output "layers" + (there is no affine transform or nonlinearity), they just directly map to an + output-node in nnet3. + """ + class XconfigTrivialOutputLayer(XconfigLayerBase): """This class is for lines like 'output name=output input=Append(input@-1, input@0, input@1, ReplaceIndex(ivector, t, 0))' @@ -1057,6 +1067,60 @@ def get_full_config(self): ans.append(('ref', line)) return ans +class XconfigAuxiliaryLayer(XconfigLayerBase): + """This class is for lines like + 'auxiliary name=aux dim=40' + in the config file. 
+ This layer contains dim and name. + This class is useful in cases like transferring + existing models and using {input,output,component}-nodes + of that model as input to new layers. + """ + + def __init__(self, first_token, key_to_value, prev_names = None): + + assert first_token == 'auxiliary' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + + def set_default_configs(self): + + self.config = { 'dim': -1} + + def check_configs(self): + + if self.config['dim'] <= 0: + raise RuntimeError("Dimension of auxiliary-layer '{0}'" + "should be positive.".format(self.name)) + + def get_input_descriptor_names(self): + + return [] # there is no 'input' field in self.config. + + def output_name(self, auxiliary_outputs = None): + + # there are no auxiliary outputs as this layer will just pass the input + assert auxiliary_outputs is None + return self.name + + def output_dim(self, auxiliary_outputs = None): + + # there are no auxiliary outputs as this layer will just pass the input + assert auxiliary_outputs is None + return self.config['dim'] + + def get_full_config(self): + + # unlike other layers the input layers need to be printed in + # 'init.config' (which initializes the neural network prior to the LDA) + ans = [] + for config_name in [ 'init', 'ref', 'final' ]: + ans.append( (config_name, + 'auxiliary-node name={0} dim={1}'.format(self.name, + self.config['dim']))) + return ans + + def test_layers(): # for some config lines that should be printed the same way as they diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index f890dd878a9..2dba6f33949 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -119,7 +119,7 @@ def get_model_component_info(model_filename): # The layer type is component-node assert(input_str is not None) key_to_value['dim'] = output_dim - all_layers.append(xlayers.XconfigInputLayer('input', key_to_value, all_layers)) + all_layers.append(xlayers.XconfigAuxiliaryLayer('auxiliary', key_to_value, all_layers)) if len(all_layers) == 0: raise RuntimeError("{0}: model filename '{1}' is empty.".format( sys.argv[0], model_filename)) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py index 759640e37ed..5dcc475a967 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py @@ -14,6 +14,7 @@ # Given a list of objects of type XconfigLayerBase ('all_layers'), # including at least the layers preceding 'current_layer' (and maybe # more layers), return the names of layers preceding 'current_layer' +# regardless of layers type 'auxiliary'. # This will be used in parsing expressions like [-1] in descriptors # (which is an alias for the previous layer). def get_prev_names(all_layers, current_layer): @@ -21,7 +22,8 @@ def get_prev_names(all_layers, current_layer): for layer in all_layers: if layer is current_layer: break - prev_names.append(layer.get_name()) + if layer.layer_type is not 'auxiliary': + prev_names.append(layer.get_name()) prev_names_set = set() for name in prev_names: if name in prev_names_set: diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index e599c981f94..e4cf5bb842a 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -101,6 +101,11 @@ def get_args(): help="Deprecated. 
Kept for back compatibility") # trainer options + parser.add_argument("--trainer.input-model", type=str, + dest='input_model', default=None, + action=common_lib.NullstrToNoneAction, + help="If specified, this model is used as 0.raw model " + " and no LDA matrix or init.raw initialzed.") parser.add_argument("--trainer.num-epochs", type=float, dest='num_epochs', default=10.0, help="Number of epochs to train the model") @@ -309,7 +314,7 @@ def train(args, run_opts): logger.info("Creating denominator FST") chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts) - if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config"): + if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config") and args.input_model is None: logger.info("Initializing a basic network for estimating " "preconditioning matrix") common_lib.execute_command( @@ -377,7 +382,7 @@ def train(args, run_opts): logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir)) common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) - if (args.stage <= -2) and os.path.exists(args.dir+"/configs/init.config"): + if (args.stage <= -2) and os.path.exists(args.dir+"/configs/init.config") and args.input_model is None: logger.info('Computing the preconditioning matrix for input features') chain_lib.compute_preconditioning_matrix( @@ -387,7 +392,7 @@ def train(args, run_opts): if (args.stage <= -1): logger.info("Preparing the initial acoustic model.") - chain_lib.prepare_initial_acoustic_model(args.dir, run_opts) + chain_lib.prepare_initial_acoustic_model(args.dir, run_opts,input_mdl=args.input_model) with open("{0}/frame_subsampling_factor".format(args.dir), "w") as f: f.write(str(args.frame_subsampling_factor)) diff --git a/egs/wsj/s5/steps/nnet3/chain/train_more.py b/egs/wsj/s5/steps/nnet3/chain/train_more.py index 047a20732b6..1e0d3e6e57f 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_more.py +++ b/egs/wsj/s5/steps/nnet3/chain/train_more.py @@ -115,6 +115,11 @@ def get_args(): help="Deprecated. Kept for back compatibility") # trainer options + parser.add_argument("--trainer.input-model", type=str, + dest='input_mdl', default=None, + action=common_lib.NullstrToNoneAction, + help="If specified, this model is used as 0.raw model " + " and no LDA matrix or init.raw initialzed.") parser.add_argument("--trainer.num-epochs", type=float, dest='num_epochs', default=10.0, help="Number of epochs to train the model") From e830a04db648d5c3fbf06f1013facf79691267c2 Mon Sep 17 00:00:00 2001 From: Pegita Date: Wed, 9 Aug 2017 19:01:52 -0400 Subject: [PATCH 034/174] modified scripts to accept --trainer.input-model and prepare *.fst outside train.py. the next step is to remove scales from phone lm generation. 
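In train.py the new --trainer.input-model option means "use this raw model as 0.raw and skip the init.config / LDA-estimation stages". A condensed, runnable Python sketch of the option and the stage guards; everything outside the guards is elided, and the directory default is a placeholder:

import argparse, os

parser = argparse.ArgumentParser()
parser.add_argument("--trainer.input-model", type=str, dest="input_model",
                    default=None,
                    help="If specified, this model is used as the 0.raw model "
                         "and no LDA matrix or init.raw is initialized.")
parser.add_argument("--stage", type=int, default=-10)
parser.add_argument("--dir", type=str, default="exp/chain/tdnn_example")
args = parser.parse_args([])   # the real script parses sys.argv

have_init_config = os.path.exists(args.dir + "/configs/init.config")

if args.stage <= -4 and have_init_config and args.input_model is None:
    pass  # initialize the basic network used for the preconditioning matrix
if args.stage <= -2 and have_init_config and args.input_model is None:
    pass  # compute the preconditioning matrix on input features
if args.stage <= -1:
    pass  # prepare_initial_acoustic_model(..., input_mdl=args.input_model)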
--- .../local/chain/tuning/run_tdnn_wsj_rm_1a.sh | 6 +- .../local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 67 +- .../local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 48 +- egs/rm/s5/local/prepare_wsj_rm_lang.sh | 47 +- .../nnet3/train/chain_objf/acoustic_model.py | 76 +-- egs/wsj/s5/steps/libs/nnet3/train/common.py | 26 +- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 2 +- .../nnet3/chain/make_weighted_den_fst.sh | 91 +++ egs/wsj/s5/steps/nnet3/chain/train.py | 30 +- egs/wsj/s5/steps/nnet3/chain/train_more.py | 597 ------------------ egs/wsj/s5/steps/nnet3/xconfig_to_configs.py | 101 +-- 11 files changed, 302 insertions(+), 789 deletions(-) create mode 100755 egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh delete mode 100755 egs/wsj/s5/steps/nnet3/chain/train_more.py diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh index 2f93c078a38..23a3b91fbfe 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -34,7 +34,8 @@ srcdir=../../wsj/s5/ common_egs_dir= #common_egs_dir=exp/chain/tdnn_wsj_rm_1c_fixed_ac_scale/egs src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl -primary_lr_factor=0.25 +primary_lr_factor=0.25 # The learning-rate factor for transferred layers from source + # model. dim=450 nnet_affix=_online # End configuration section. @@ -125,7 +126,8 @@ EOF # Set the learning-rate-factor to be primary_lr_factor for initial network." # and add new layer to initial model - nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" $src_mdl - | \ + $train_cmd $dir/log/generate_input_mdl.log \ + nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" $src_mdl - \| \ nnet3-init --srand=1 - $dir/configs/final.config $dir/input.raw || exit 1; fi diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index 1a4ce46ad39..2aff104a22a 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -1,12 +1,12 @@ #!/bin/bash # _1b is as _1a, but different as follows -# 1) uses src phone set phones.txt and new lexicon generated using word pronunciation -# in src lexincon.txt and target word not presented in src are added as oov -# in lexicon.txt. -# 2) It uses src tree-dir and generates new target alignment and lattices using -# src gmm model. -# 3) It also train phone LM using weighted combination of alignemts from source -# and target, which is used in chain denominator graph. +# 1) uses wsj phone set phones.txt and new lexicon generated using word pronunciation +# in swj lexincon.txt and rm words not presented in wsj are added as oov +# in new lexicon.txt. +# 2) It uses wsj tree-dir and generates new rm alignments and lattices using +# wsj gmm model. +# 3) It also train phone LM using weighted combination of alignemts from wsj +# and rm, which is used in chain denominator graph. # Since we use phone.txt from source dataset, this can be helpful in cases # where there is few training data in target and some 4-gram phone sequences # have no count in target. 
@@ -24,7 +24,7 @@ set -e # configs for 'chain' stage=7 -train_stage=-10 +train_stage=-4 get_egs_stage=-10 # training options @@ -43,7 +43,8 @@ xent_regularize=0.1 # configs for transfer learning common_egs_dir= -srcdir=../../wsj/s5/ +#srcdir=../../wsj/s5/ +srcdir=/export/a09/pegahgh/kaldi-transfer-learning/egs/wsj/s5-sp src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl src_lang=$srcdir/data/lang src_gmm_dir=$srcdir/exp/tri4b @@ -55,6 +56,9 @@ final_lr_factor=1.0 # learning-rate factor for final layer in transferring sou nnet_affix=_online_wsj tgt_lm_scale=10 src_lm_scale=1 +phone_lm_scales="1,10" # comma-separated list of integer valued scale weights + # to scale different phone sequences for different alignments + # e.g. (src-weight,target-weight)=(10,1) tdnn_affix=_1b # End configuration section. @@ -82,7 +86,7 @@ ali_dir=exp/tri4b${src_tree_dir:+_wsj}_ali lat_dir=exp/tri3b_lats${src_tree_dir:+_wsj} dir=exp/chain/tdnn_wsj_rm${tdnn_affix} -required_files="$src_mdl $src_lang/lexicon.txt $src_gmm_dir/final.mdl $srd_tree_dir/tree" +required_files="$src_mdl $src_lang/phones.txt $srcdir/data/local/dict_nosp/lexicon.txt $src_gmm_dir/final.mdl $src_tree_dir/tree" for f in $required_files; do if [ ! -f $f ]; then @@ -123,42 +127,41 @@ if [ $stage -le 5 ]; then fi if [ $stage -le 6 ]; then - echo "$0: creating neural net configs using the xconfig parser for"; - echo "extra layers w.r.t source network."; - num_targets=$(tree-info $src_tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - mkdir -p $dir - mkdir -p $dir/configs - touch $dir/configs/network.xconfig - steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \ - --xconfig-file $dir/configs/network.xconfig \ - --edits-config $dir/configs/edits.config \ - --config-dir $dir/configs/ + # set the learning-rate-factor for initial network to be primary_lr_factor." + $train_cmd $dir/log/generate_input_mdl.log \ + nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ + $src_mdl $dir/input.raw || exit 1; fi - if [ $stage -le 7 ]; then + echo "$0: compute {den,normalization}.fst using weighted phone LM with wsj and rm weight $phone_lm_scales." + $train_cmd $dir/log/make_weighted_den_fst.log \ + steps/nnet3/chain/make_weighted_den_fst.sh --weights $phone_lm_scales \ + --lm-opts '--num-extra-lm-states=200' \ + $src_tree_dir $ali_dir $dir || exit 1; +fi + +if [ $stage -le 8 ]; then echo "$0: generate egs for chain to train new model on rm dataset." if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi - # set the learning-rate-factor for initial network to be zero." 
- $decode_cmd $dir/log/copy_mdl.log \ - nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=$final_lr_factor" \ - $src_mdl $dir/init.raw || exit 1; + # exclude phone_LM and den.fst generation training stage + if [ $train_stage -lt -4 ]; then + train_stage=-4 + fi - steps/nnet3/chain/train_more.py --stage $train_stage \ + steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ + --trainer.input-model $dir/input.raw \ --feat.online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ --chain.xent-regularize $xent_regularize \ - --chain.alignments-for-lm="$ali_dir:$tgt_lm_scale,$src_tree_dir:$src_lm_scale" \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize 0.1 \ --chain.leaky-hmm-coefficient 0.1 \ --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=200" \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $frames_per_eg \ @@ -177,12 +180,12 @@ if [ $stage -le 7 ]; then --dir $dir || exit 1; fi -if [ $stage -le 8 ]; then +if [ $stage -le 9 ]; then steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ data/test_hires $srcdir/exp/nnet3/extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; fi -if [ $stage -le 9 ]; then +if [ $stage -le 10 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. @@ -194,7 +197,7 @@ if [ $stage -le 9 ]; then $dir/graph data/test_hires $dir/decode || exit 1; fi -if [ $stage -le 10 ]; then +if [ $stage -le 11 ]; then utils/mkgraph.sh --self-loop-scale 1.0 $lang_ug_dir $dir $dir/graph_ug steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj 20 --cmd "$decode_cmd" \ diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index dbbf69da9ce..b4063cfe014 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -1,5 +1,6 @@ #!/bin/bash -# _1c is as _1b but it uses src chain model instead of GMM model to generate alignment for RM using SWJ model. +# _1c is as _1b but it uses src chain model instead of GMM model to generate +# alignments for RM using SWJ model. # _1b is as _1a, but different as follows # 1) uses src phone set phones.txt and new lexicon generated using word pronunciation @@ -27,7 +28,7 @@ set -e # configs for 'chain' stage=8 -train_stage=-10 +train_stage=-4 get_egs_stage=-10 dir=exp/chain/tdnn_wsj_rm_1c @@ -55,9 +56,11 @@ src_tree_dir=$srcdir/exp/chain/tree_a_sp # chain tree-dir for src data; # the alignment in target domain is # converted using src-tree primary_lr_factor=0.25 # learning-rate factor for all except last layer in transferred source model -final_lr_factor=1.0 # learning-rate factor for final affine layer in transferred source model. nnet_affix=_online_wsj +phone_lm_scales="1,10" # comma-separated list of integer valued scale weights + # to scale different phone sequences for different alignments + # e.g. (src-weight,target-weight)=(10,1) # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -127,40 +130,41 @@ if [ $stage -le 5 ]; then fi if [ $stage -le 6 ]; then - echo "$0: creating neural net configs using the xconfig parser and "; - echo "generating $dir/configs/vars."; - num_targets=$(tree-info $src_tree_dir/tree |grep num-pdfs|awk '{print $2}') - mkdir -p $dir - mkdir -p $dir/configs - touch $dir/configs/network.xconfig - steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \ - --xconfig-file $dir/configs/network.xconfig \ - --edits-config $dir/configs/edits.config \ - --config-dir $dir/configs/ + # set the learning-rate-factor for initial network to be primary_lr_factor." + $train_cmd $dir/log/generate_input_mdl.log \ + nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ + $src_mdl $dir/input.raw || exit 1; fi if [ $stage -le 7 ]; then + echo "$0: compute {den,normalization}.fst using weighted phone LM." + $train_cmd $dir/log/make_weighted_den_fst.log \ + steps/nnet3/chain/make_weighted_den_fst.sh --weights $phone_lm_scales \ + --lm-opts '--num-extra-lm-states=200' \ + $src_tree_dir $ali_dir $dir || exit 1; +fi + +if [ $stage -le 8 ]; then echo "$0: generate egs for chain to train new model on rm dataset." if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi - echo "$0: set the learning-rate-factor for initial network to be zero." - $decode_cmd $dir/log/copy_mdl.log \ - nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=$final_lr_factor" \ - $src_mdl $dir/init.raw || exit 1; + # exclude phone_LM and den.fst generation training stage + if [ $train_stage -lt -4 ]; then + train_stage=-4 + fi steps/nnet3/chain/train_more.py --stage $train_stage ${chain_opts[@]} \ --cmd "$decode_cmd" \ + --trainer.input-model $dir/input.raw \ --feat.online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ --chain.xent-regularize $xent_regularize \ - --chain.alignments-for-lm="$ali_dir:10,$src_tree_dir:1" \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize 0.1 \ --chain.leaky-hmm-coefficient 0.1 \ --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=200" \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $frames_per_eg \ @@ -179,12 +183,12 @@ if [ $stage -le 7 ]; then --dir $dir || exit 1; fi -if [ $stage -le 8 ]; then +if [ $stage -le 9 ]; then steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ data/test_hires $srcdir/exp/nnet3/extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; fi -if [ $stage -le 9 ]; then +if [ $stage -le 10 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
@@ -196,7 +200,7 @@ if [ $stage -le 9 ]; then $dir/graph data/test_hires $dir/decode || exit 1; fi -if [ $stage -le 10 ]; then +if [ $stage -le 11 ]; then utils/mkgraph.sh --self-loop-scale 1.0 ${lang_src_tgt}_ug $dir $dir/graph_ug steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj 20 --cmd "$decode_cmd" \ diff --git a/egs/rm/s5/local/prepare_wsj_rm_lang.sh b/egs/rm/s5/local/prepare_wsj_rm_lang.sh index 10a01514ca0..9d3c8dd3c2d 100755 --- a/egs/rm/s5/local/prepare_wsj_rm_lang.sh +++ b/egs/rm/s5/local/prepare_wsj_rm_lang.sh @@ -1,9 +1,10 @@ #!/bin/bash # Copyright 2017 Pegah Ghahremani -# This script prepares a dictionary for wsj-rm experiment using wsj phone set, lexicon and dict and -# rm's words.txt are copied from wsj lexicon for common words in wsj -# and rm. words in rm that are not available in the wsj lexicon are added +# This script prepares a dictionary for wsj to rm transfer learning experiment +# which uses wsj phone set, lexicon and dict and +# the lexicon for rm words.txt are copied from wsj lexicon for common words in wsj +# and rm. words in rm that are not in the wsj lexicon are added # as oov in lexicon.txt. # The oov word "" in wsj is also added to words.txt and G.fst is recompiled using # updated word list. @@ -12,14 +13,14 @@ if [ -f path.sh ]; then . ./path.sh; fi . utils/parse_options.sh if [ $# != 3 ]; then - echo "Usage: local/prepare_wsj_rm_lang.sh " + echo "Usage: local/prepare_wsj_rm_lang.sh " echo "e.g:" echo "$0 ../../wsj/s5/data/local/dict ../../wsj/s5/data/lang_nosp data/wsj_rm_dir" fi src_dict=$1 src_lang=$2 -src_tgt_lang=$3 +output_dir=$3 required_dict_files="$src_dict/lexicon.txt $src_dict/nonsilence_phones.txt $src_dict/silence_phones.txt $src_dict/optional_silence.txt $src_lang/oov.txt $src_lang/phones.txt" for f in $required_dict_files; do @@ -28,41 +29,41 @@ for f in $required_dict_files; do fi done -rm -rf $src_tgt_lang -mkdir -p $src_tgt_lang -mkdir -p $src_tgt_lang/local +rm -r $output_dir 2>/dev/null +mkdir -p $output_dir +mkdir -p $output_dir/local # copy *phones.txt from source to target. -cp -r $src_dict $src_tgt_lang/local/dict -rm $src_tgt_lang/local/dict/lexicon*.txt +cp -r $src_dict $output_dir/local/dict +rm $output_dir/local/dict/lexicon*.txt oov_word=`cat $src_lang/oov.txt` # common word list in rm lexicon with lexicon in wsj comm -12 <(awk '{print $1}' data/local/dict/lexicon.txt | sed "s/\+/\'/g" | sort) \ -<(awk '{print $1}' $src_dict/lexicon.txt | sort) | \ -sed -r "s/'/+/g" | sort > $src_tgt_lang/words_tmp.txt + <(awk '{print $1}' $src_dict/lexicon.txt | sort) | \ + sed -r "s/'/+/g" | sort > $output_dir/words_tmp.txt comm -23 <(awk '{print $1}' data/local/dict/lexicon.txt | sed "s/\+/\'/g" | sort) \ -<(awk '{print $1}' $src_dict/lexicon.txt | sort) | \ -sed -r "s/'/+/g" | sort > $src_tgt_lang/words_only_tgt.txt + <(awk '{print $1}' $src_dict/lexicon.txt | sort) | \ + sed -r "s/'/+/g" | sort > $output_dir/words_only_tgt.txt -# add to rm_swj_word list -(echo "$oov_word"; cat $src_tgt_lang/words_tmp.txt) | sort > $src_tgt_lang/words_tgt_src.txt -rm $src_tgt_lang/words_tmp.txt +# add oov_word to word list +(echo "$oov_word"; cat $output_dir/words_tmp.txt) | sort > $output_dir/words_tgt_src.txt +rm $output_dir/words_tmp.txt # we use wsj lexicon and find common word list in rm and wsj to generate lexicon for rm-wsj # using wsj phone sets. More than 90% of words in RM are in WSJ(950/994). 
-cat $src_tgt_lang/words_tgt_src.txt | sed "s/\+/\'/g" | \ +cat $output_dir/words_tgt_src.txt | sed "s/\+/\'/g" | \ utils/apply_map.pl --permissive $src_dict/lexicon.txt | \ -paste <(cat $src_tgt_lang/words_tgt_src.txt) - > $src_tgt_lang/local/dict/lexicon_tgt_src.txt + paste <(cat $output_dir/words_tgt_src.txt) - > $output_dir/local/dict/lexicon_tgt_src.txt # extend lexicon.txt by adding only_tg words as oov. oov_phone=`grep "$oov_word" $src_dict/lexicon.txt | cut -d' ' -f2` -cat $src_tgt_lang/local/dict/lexicon_tgt_src.txt <(sed 's/$/ SPN/g' $src_tgt_lang/words_only_tgt.txt) | sort -u > $src_tgt_lang/local/dict/lexicon.txt +cat $output_dir/local/dict/lexicon_tgt_src.txt <(sed 's/$/ SPN/g' $output_dir/words_only_tgt.txt) | sort -u > $output_dir/local/dict/lexicon.txt # prepare dictionary using new lexicon.txt for RM-SWJ. utils/prepare_lang.sh --phone-symbol-table $src_lang/phones.txt \ -$src_tgt_lang/local/dict "$oov_word" $src_tgt_lang/local/lang_tmp $src_tgt_lang + $output_dir/local/dict "$oov_word" $output_dir/local/lang_tmp $output_dir # Generate new G.fst using updated words list with added -fstcompile --isymbols=$src_tgt_lang/words.txt --osymbols=$src_tgt_lang/words.txt --keep_isymbols=false \ - -keep_osymbols=false data/local/tmp/G.txt | fstarcsort --sort_type=ilabel > $src_tgt_lang/G.fst || exit 1; +fstcompile --isymbols=$output_dir/words.txt --osymbols=$output_dir/words.txt --keep_isymbols=false \ + -keep_osymbols=false data/local/tmp/G.txt | fstarcsort --sort_type=ilabel > $output_dir/G.fst || exit 1; diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index a5ce15c80b6..4912ac12356 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -12,7 +12,6 @@ import math import os import sys -import filecmp import libs.common as common_lib import libs.nnet3.train.common as common_train_lib @@ -21,64 +20,32 @@ logger.addHandler(logging.NullHandler()) -def create_phone_lm(dir, tree_dir, run_opts, alignment_dirs=None, lm_opts=None): - """Create a phone LM for chain training. +def create_phone_lm(dir, tree_dir, run_opts, lm_opts=None): + """Create a phone LM for chain training - This method trains a phone LM usingalignments. - If alignment_dirs is non empty, it trains weighted phone LM for chain - training generated by weighted combination of counts in alignments provided - in "alignments_dir", which is a list of comma-separated alignment directories - with same phone sets (each containing tree , final.mdl and ali.*.gz) - and the colon-separated integer weights appended - to each alignment dir.(if none, the weight is 1) - i.e. alignment_dirs="ali_dir1:w1,ali_dir2:w2" - If "alignment_dirs is empty, phone LM generated using alignments in tree_dir. 
+ This method trains a phone LM for chain training using the alignments + in "tree_dir" """ - ali_dirs = [] - scales = [] - if alignment_dirs is not None: - ali_weight_dirs = alignment_dirs.split(",") - for ali_weight_dir in ali_weight_dirs: - ali_and_weights = ali_weight_dir.split(":") - ali_dirs.append(ali_and_weights[0]) - if (len(ali_and_weights) == 2): - assert(ali_and_weights[1].isdigit()) - scales.append(ali_and_weights[1]) - else: - scales.append("1") - else: - ali_dirs.append(tree_dir) - - phone_alignments_list = [] - for ali_dir in ali_dirs: - try: - f = open(ali_dir + "/num_jobs", 'r') - num_ali_jobs = int(f.readline()) - assert num_ali_jobs > 0 - except: - raise Exception("""There was an error getting the number of alignment - jobs from {0}/num_jobs""".format(ali_dir)) - - common_lib.execute_command( - """ utils/lang/check_phones_compatible.sh \ - {0}/phones.txt {1}/phones.txt""".format(tree_dir, ali_dir)) - - alignments=' '.join(['{0}/ali.{1}.gz'.format(ali_dir, job) - for job in range(1, num_ali_jobs + 1)]) - - phone_alignments_list.append("""'ark:gunzip -c {0} | ali-to-phones {1}/final.mdl ark:- ark:-|'""".format(alignments, ali_dir)) + try: + f = open(tree_dir + "/num_jobs", 'r') + num_ali_jobs = int(f.readline()) + assert num_ali_jobs > 0 + except: + raise Exception("""There was an error getting the number of alignment + jobs from {0}/num_jobs""".format(tree_dir)) - phone_alignments_list_str=' '.join(phone_alignments_list) - if len(scales) > 0: - scales_str=','.join(scales) - lm_opts="{0} --scales={1}".format(lm_opts, scales_str) + alignments=' '.join(['{0}/ali.{1}.gz'.format(tree_dir, job) + for job in range(1, num_ali_jobs + 1)]) common_lib.execute_command( """{command} {dir}/log/make_phone_lm.log \ - chain-est-phone-lm {lm_opts} {phone_alignments_list} {dir}/phone_lm.fst""".format( - command=run_opts.command, dir=dir, - lm_opts=lm_opts if lm_opts is not None else '', - phone_alignments_list=phone_alignments_list_str)) + gunzip -c {alignments} \| \ + ali-to-phones {tree_dir}/final.mdl ark:- ark:- \| \ + chain-est-phone-lm {lm_opts} ark:- {dir}/phone_lm.fst""".format( + command=run_opts.command, dir=dir, + alignments=alignments, + lm_opts=lm_opts if lm_opts is not None else '', + tree_dir=tree_dir)) def create_denominator_fst(dir, tree_dir, run_opts): @@ -398,7 +365,7 @@ def check_for_required_files(feat_dir, tree_dir, lat_dir): files = ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), - '{0}/num_jobs'.format(lat_dir)] + '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)] for file in files: if not os.path.isfile(file): raise Exception('Expected {0} to exist.'.format(file)) @@ -466,6 +433,7 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_mdl=None): if input_mdl is None: common_train_lib.prepare_initial_network(dir, run_opts, srand=srand) + # The model-format for a 'chain' acoustic model is just the transition # model and then the raw nnet, so we can use 'cat' to create this, as # long as they have the same mode (binary or not binary). 
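[Editorial note, not part of the patch: after the simplification above, create_phone_lm() no longer takes weighted alignment sources; it expands to a single pipeline over the tree-dir alignments. A minimal sketch of the resulting command, assuming a hypothetical tree dir exp/chain/tree_sp, output dir exp/chain/tdnn1a, and an illustrative --num-extra-lm-states value:

  # estimate the un-smoothed phone LM directly from the tree-dir alignments
  gunzip -c exp/chain/tree_sp/ali.*.gz | \
    ali-to-phones exp/chain/tree_sp/final.mdl ark:- ark:- | \
    chain-est-phone-lm --num-extra-lm-states=2000 ark:- exp/chain/tdnn1a/phone_lm.fst

Weighted combination of several alignment directories is instead delegated to the new steps/nnet3/chain/make_weighted_den_fst.sh introduced later in this patch.]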
diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 3f4bb7ccf88..e7b7c5337b8 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -357,6 +357,30 @@ def parse_generic_config_vars_file(var_file): raise Exception('Error while parsing the file {0}'.format(var_file)) +def parse_input_model(input_model): + variables = {} + try: + assert(os.path.exists(input_model)) + out = common_lib.get_command_stdout("""nnet3-info {0} | """ + """head -4 """.format(input_model)) + # out looks like this + # left-context: 7 + # right-context: 0 + # num-parameters: 90543902 + # modulus: 1 + for line in out.split("\n"): + parts = line.split(":") + if len(parts) != 2: + continue + if parts[0].strip() == 'left-context': + variables['model_left_context'] = int(parts[1].strip()) + elif parts[0].strip() == 'right-context': + variables['model_right_context'] = int(parts[1].strip()) + + except ValueError: + pass + return variables + def verify_egs_dir(egs_dir, feat_dim, ivector_dim, ivector_extractor_id, left_context, right_context, left_context_initial=-1, right_context_final=-1): @@ -856,7 +880,7 @@ def __init__(self, sequentially.""") self.parser.add_argument("--trainer.optimization.backstitch-training-scale", type=float, dest='backstitch_training_scale', - default=0.0, help="""scale of parameters changes + default=0.0, help="""scale of parameters changes used in backstitch training step.""") self.parser.add_argument("--trainer.optimization.backstitch-training-interval", type=int, dest='backstitch_training_interval', diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 2dba6f33949..96f5e824f1e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -131,7 +131,7 @@ def get_model_component_info(model_filename): # (usually we use the variable name 'all_layers' elsewhere for this). # It will die if the xconfig file is empty or if there was # some error parsing it. -# aux_layers is a list of auxilary layers(component-nodes or input-node) that +# aux_layers is a list of auxilary layers({component,input,output}-node) # can be used as input to component-nodes used in xconfig_file. def read_xconfig_file(xconfig_filename, aux_layers=[]): try: diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh new file mode 100755 index 00000000000..67d6eefce5b --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +# Copyright 2014-17 Vimal Manohar +# 2017 Pegah Ghahremani +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This script creates denominator FST (den.fst) and normalization.fst for +# chain training. It additional copies the transition model and tree from the +# first alignment directory to the chain directory. 
+# This script can accept multiple sources of alignments with same phone sets +# that can be weighted to estimate phone LM. +# Each alignment directory should contain tree, final,mdl and ali.*.gz. + +set -o pipefail + +# begin configuration section. +cmd=run.pl +stage=-10 +weights= # comma-separated list of integer valued scale weights used + # to scale different phone sequences for different alignments. +lm_opts='num_extra_lm_state=2000' +#end configuration section. + +help_message="Usage: "$(basename $0)" [options] [ ...] + E.g. "$(basename $0)" exp/tri1_ali exp/tri2_ali exp/chain/tdnn_1a_sp +Options: + --cmd (run.pl|queue.pl...) # specify how to run the sub-processes. +"; + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -lt 2 ]; then + printf "$help_message\n"; + exit 1; +fi + +dir=${@: -1} # last argument to the script +ali_dirs=( $@ ) # read the remaining arguments into an array +unset ali_dirs[${#ali_dirs[@]}-1] # 'pop' the last argument which is odir +num_alignments=${#ali_dirs[@]} # number of systems to combine + +mkdir -p $dir/log +for n in `seq 0 $[$num_alignments-1]`;do + ali_dir=${ali_dirs[$n]} + for f in $ali_dir/ali.1.gz $ali_dir/final.mdl $ali_dir/tree; do + if [ ! -f $f ]; then + echo "$0: Could not find file $f" + exit 1 + fi + done + utils/lang/check_phones_compatible.sh ${ali_dirs[0]}/phones.txt \ + ${ali_dirs[$n]}/phones.txt +done + +cp ${ali_dirs[0]}/tree $dir/ || exit 1 + +for n in `seq 0 $[num_alignments-1]`; do + adir=${ali_dirs[$n]} + alignments+=("ark:gunzip -c $adir/ali.*.gz | ali-to-phones $adir/final.mdl ark:- ark:- |") +done + +if [ $stage -le 1 ]; then + $cmd $dir/log/make_phone_lm.log \ + chain-est-phone-lm $lm_opts --scales="$weights" \ + "${alignments[@]}" $dir/phone_lm.fst || exit 1 +fi + +if [ $stage -le 2 ]; then + copy-transition-model ${ali_dirs[0]}/final.mdl $dir/0.trans_mdl +fi + +if [ $stage -le 3 ]; then + $cmd $dir/log/make_den_fst.log \ + chain-make-den-fst $dir/tree $dir/0.trans_mdl \ + $dir/phone_lm.fst \ + $dir/den.fst $dir/normalization.fst || exit 1 +fi + +exit 0 diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index e4cf5bb842a..7dd75171783 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -105,7 +105,9 @@ def get_args(): dest='input_model', default=None, action=common_lib.NullstrToNoneAction, help="If specified, this model is used as 0.raw model " - " and no LDA matrix or init.raw initialzed.") + "and no LDA matrix or init.raw initialzed." + "Also configs dir is not expected to exist " + "and context is generated using this model.") parser.add_argument("--trainer.num-epochs", type=float, dest='num_epochs', default=10.0, help="Number of epochs to train the model") @@ -213,11 +215,13 @@ def process_args(args): args.deriv_truncate_margin)) if (not os.path.exists(args.dir) - or not os.path.exists(args.dir+"/configs")): - raise Exception("This scripts expects {0} to exist and have a configs " + or (not os.path.exists(args.dir+"/configs") and + not os.path.exists(args.input_model))): + raise Exception("This scripts expects {0} to exist. 
Also either of --trainer.input-model " + " as '0.raw' model should exist or {0} should have a configs " "directory which is the output of " - "make_configs.py script".format( - args.dir)) + "make_configs.py script.".format( + args.dir, args.input_model)) if args.transform_dir is None: args.transform_dir = args.lat_dir @@ -277,14 +281,18 @@ def train(args, run_opts): # we will use the same number of jobs as that used for alignment common_lib.execute_command("utils/split_data.sh {0} {1}".format( args.feat_dir, num_jobs)) - shutil.copy('{0}/tree'.format(args.tree_dir), args.dir) with open('{0}/num_jobs'.format(args.dir), 'w') as f: f.write(str(num_jobs)) - config_dir = '{0}/configs'.format(args.dir) - var_file = '{0}/vars'.format(config_dir) + if args.input_model is None: + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) - variables = common_train_lib.parse_generic_config_vars_file(var_file) + variables = common_train_lib.parse_generic_config_vars_file(var_file) + else: + # if args.input_model specified, the model left and right context + # computed using input_model. + variables = common_train_lib.parse_input_model(args.input_model) # Set some variables. try: @@ -312,6 +320,7 @@ def train(args, run_opts): if (args.stage <= -5): logger.info("Creating denominator FST") + shutil.copy('{0}/tree'.format(args.tree_dir), args.dir) chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts) if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config") and args.input_model is None: @@ -336,6 +345,9 @@ def train(args, run_opts): default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") + assert(os.path.exists("{0}/den.fst".format(args.dir)) and + os.path.exists("{0}/normalization.fst".format(args.dir)) and + os.path.exists("{0}/tree".format(args.dir))) # this is where get_egs.sh is called. chain_lib.generate_chain_egs( dir=args.dir, data=args.feat_dir, diff --git a/egs/wsj/s5/steps/nnet3/chain/train_more.py b/egs/wsj/s5/steps/nnet3/chain/train_more.py deleted file mode 100755 index 1e0d3e6e57f..00000000000 --- a/egs/wsj/s5/steps/nnet3/chain/train_more.py +++ /dev/null @@ -1,597 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2016 Vijayaditya Peddinti. -# 2016 Vimal Manohar -# Apache 2.0. - -""" This script is as steps/nnet3/chain/train.py but it is used for weight transfer - and generates weighted phone LM for chain model using multiple alignment sources. -""" - -import argparse -import logging -import os -import pprint -import shutil -import sys -import traceback - -sys.path.insert(0, 'steps') -import libs.nnet3.train.common as common_train_lib -import libs.common as common_lib -import libs.nnet3.train.chain_objf.acoustic_model as chain_lib -import libs.nnet3.report.log_parse as nnet3_log_parse - - -logger = logging.getLogger('libs') -logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setLevel(logging.INFO) -formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " - "%(funcName)s - %(levelname)s ] %(message)s") -handler.setFormatter(formatter) -logger.addHandler(handler) -logger.info('Starting chain model trainer (train.py)') - - -def get_args(): - """ Get args from stdin. - - We add compulsary arguments as named arguments for readability - - The common options are defined in the object - libs.nnet3.train.common.CommonParser.parser. 
- See steps/libs/nnet3/train/common.py - """ - - parser = argparse.ArgumentParser( - description="""Trains RNN and DNN acoustic models using the 'chain' - objective function.""", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - conflict_handler='resolve', - parents=[common_train_lib.CommonParser().parser]) - - # egs extraction options - parser.add_argument("--egs.chunk-width", type=str, dest='chunk_width', - default="20", - help="""Number of frames per chunk in the examples - used to train the RNN. Caution: if you double this you - should halve --trainer.samples-per-iter. May be - a comma-separated list of alternatives: first width - is the 'principal' chunk-width, used preferentially""") - - # chain options - parser.add_argument("--chain.lm-opts", type=str, dest='lm_opts', - default=None, action=common_lib.NullstrToNoneAction, - help="options to be be passed to chain-est-phone-lm") - parser.add_argument("--chain.alignments-for-lm", type=str, - dest='alignments_for_lm', default=None, - action=common_lib.NullstrToNoneAction, - help="""Comma-separated list of alignment Directories - containing ali.*.gz and their integer-valued weights - (separated with colon), used to - generate weighted phone language model for - denominator FST. The phone sets should be similar - for all alignment dirs. - If empty, alignments in tree-dir used for phone LM - generation. - e.g. "src1/ali_dir:10.0,src2/ali_dir:2.0" - """) - parser.add_argument("--chain.l2-regularize", type=float, - dest='l2_regularize', default=0.0, - help="""Weight of regularization function which is the - l2-norm of the output of the network. It should be used - without the log-softmax layer for the outputs. As - l2-norm of the log-softmax outputs can dominate the - objective function.""") - parser.add_argument("--chain.xent-regularize", type=float, - dest='xent_regularize', default=0.0, - help="Weight of regularization function which is the " - "cross-entropy cost the outputs.") - parser.add_argument("--chain.right-tolerance", type=int, - dest='right_tolerance', default=5, help="") - parser.add_argument("--chain.left-tolerance", type=int, - dest='left_tolerance', default=5, help="") - parser.add_argument("--chain.leaky-hmm-coefficient", type=float, - dest='leaky_hmm_coefficient', default=0.00001, - help="") - parser.add_argument("--chain.apply-deriv-weights", type=str, - dest='apply_deriv_weights', default=True, - action=common_lib.StrToBoolAction, - choices=["true", "false"], - help="") - parser.add_argument("--chain.frame-subsampling-factor", type=int, - dest='frame_subsampling_factor', default=3, - help="ratio of frames-per-second of features we " - "train on, to chain model's output") - parser.add_argument("--chain.alignment-subsampling-factor", type=int, - dest='alignment_subsampling_factor', - default=3, - help="ratio of frames-per-second of input " - "alignments to chain model's output") - parser.add_argument("--chain.left-deriv-truncate", type=int, - dest='left_deriv_truncate', - default=None, - help="Deprecated. 
Kept for back compatibility") - - # trainer options - parser.add_argument("--trainer.input-model", type=str, - dest='input_mdl', default=None, - action=common_lib.NullstrToNoneAction, - help="If specified, this model is used as 0.raw model " - " and no LDA matrix or init.raw initialzed.") - parser.add_argument("--trainer.num-epochs", type=float, dest='num_epochs', - default=10.0, - help="Number of epochs to train the model") - parser.add_argument("--trainer.frames-per-iter", type=int, - dest='frames_per_iter', default=800000, - help="""Each iteration of training, see this many - [input] frames per job. This option is passed to - get_egs.sh. Aim for about a minute of training - time""") - - parser.add_argument("--trainer.num-chunk-per-minibatch", type=str, - dest='num_chunk_per_minibatch', default='128', - help="""Number of sequences to be processed in - parallel every minibatch. May be a more general - rule as accepted by the --minibatch-size option of - nnet3-merge-egs; run that program without args to see - the format.""") - - # Parameters for the optimization - parser.add_argument("--trainer.optimization.initial-effective-lrate", - type=float, dest='initial_effective_lrate', - default=0.0002, - help="Learning rate used during the initial iteration") - parser.add_argument("--trainer.optimization.final-effective-lrate", - type=float, dest='final_effective_lrate', - default=0.00002, - help="Learning rate used during the final iteration") - parser.add_argument("--trainer.optimization.shrink-value", type=float, - dest='shrink_value', default=1.0, - help="""Scaling factor used for scaling the parameter - matrices when the derivative averages are below the - shrink-threshold at the non-linearities. E.g. 0.99. - Only applicable when the neural net contains sigmoid or - tanh units.""") - parser.add_argument("--trainer.optimization.shrink-saturation-threshold", - type=float, - dest='shrink_saturation_threshold', default=0.40, - help="""Threshold that controls when we apply the - 'shrinkage' (i.e. scaling by shrink-value). If the - saturation of the sigmoid and tanh nonlinearities in - the neural net (as measured by - steps/nnet3/get_saturation.pl) exceeds this threshold - we scale the parameter matrices with the - shrink-value.""") - parser.add_argument("--trainer.optimization.proportional-shrink", type=float, - dest='proportional_shrink', default=0.0, - help="""If nonzero, this will set a shrinkage (scaling) - factor for the parameters, whose value is set as: - shrink-value=(1.0 - proportional-shrink * learning-rate), where - 'learning-rate' is the learning rate being applied - on the current iteration, which will vary from - initial-effective-lrate*num-jobs-initial to - final-effective-lrate*num-jobs-final. - Unlike for train_rnn.py, this is applied unconditionally, - it does not depend on saturation of nonlinearities. - Can be used to roughly approximate l2 regularization.""") - - # RNN-specific training options - parser.add_argument("--trainer.deriv-truncate-margin", type=int, - dest='deriv_truncate_margin', default=None, - help="""(Relevant only for recurrent models). If - specified, gives the margin (in input frames) around - the 'required' part of each chunk that the derivatives - are backpropagated to. If unset, the derivatives are - backpropagated all the way to the boundaries of the - input data. E.g. 8 is a reasonable setting. 
Note: the - 'required' part of the chunk is defined by the model's - {left,right}-context.""") - # General options - parser.add_argument("--feat-dir", type=str, required=True, - help="Directory with features used for training " - "the neural network.") - parser.add_argument("--tree-dir", type=str, required=True, - help="""Directory containing the tree to use for this - model (we also expect final.mdl and ali.*.gz in that - directory""") - parser.add_argument("--lat-dir", type=str, required=True, - help="Directory with numerator lattices " - "used for training the neural network.") - parser.add_argument("--dir", type=str, required=True, - help="Directory to store the models and " - "all other files.") - - print(' '.join(sys.argv)) - print(sys.argv) - - args = parser.parse_args() - - [args, run_opts] = process_args(args) - - return [args, run_opts] - - -def process_args(args): - """ Process the options got from get_args() - """ - - if not common_train_lib.validate_chunk_width(args.chunk_width): - raise Exception("--egs.chunk-width has an invalid value"); - - if not common_train_lib.validate_minibatch_size_str(args.num_chunk_per_minibatch): - raise Exception("--trainer.num-chunk-per-minibatch has an invalid value"); - - if args.chunk_left_context < 0: - raise Exception("--egs.chunk-left-context should be non-negative") - - if args.chunk_right_context < 0: - raise Exception("--egs.chunk-right-context should be non-negative") - - if args.left_deriv_truncate is not None: - args.deriv_truncate_margin = -args.left_deriv_truncate - logger.warning( - "--chain.left-deriv-truncate (deprecated) is set by user, and " - "--trainer.deriv-truncate-margin is set to negative of that " - "value={0}. We recommend using the option " - "--trainer.deriv-truncate-margin.".format( - args.deriv_truncate_margin)) - - if (not os.path.exists(args.dir) - or not os.path.exists(args.dir+"/configs")): - raise Exception("This scripts expects {0} to exist and have a configs " - "directory which is the output of " - "make_configs.py script") - - if args.transform_dir is None: - args.transform_dir = args.lat_dir - # set the options corresponding to args.use_gpu - run_opts = common_train_lib.RunOpts() - if args.use_gpu: - if not common_lib.check_if_cuda_compiled(): - logger.warning( - """You are running with one thread but you have not compiled - for CUDA. You may be running a setup optimized for GPUs. - If you have GPUs and have nvcc installed, go to src/ and do - ./configure; make""") - - run_opts.train_queue_opt = "--gpu 1" - run_opts.parallel_train_opts = "" - run_opts.combine_queue_opt = "--gpu 1" - - else: - logger.warning("Without using a GPU this will be very slow. " - "nnet3 does not yet support multiple threads.") - - run_opts.train_queue_opt = "" - run_opts.parallel_train_opts = "--use-gpu=no" - run_opts.combine_queue_opt = "" - - run_opts.command = args.command - run_opts.egs_command = (args.egs_command - if args.egs_command is not None else - args.command) - - return [args, run_opts] - - -def train(args, run_opts): - """ The main function for training. - - Args: - args: a Namespace object with the required parameters - obtained from the function process_args() - run_opts: RunOpts object obtained from the process_args() - """ - - arg_string = pprint.pformat(vars(args)) - logger.info("Arguments for the experiment\n{0}".format(arg_string)) - - # Check files - chain_lib.check_for_required_files(args.feat_dir, args.tree_dir, - args.lat_dir) - - # Set some variables. 
- num_jobs = common_lib.get_number_of_jobs(args.lat_dir) - feat_dim = common_lib.get_feat_dim(args.feat_dir) - ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) - ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir) - - # split the training data into parts for individual jobs - # we will use the same number of jobs as that used for alignment - common_lib.execute_command("utils/split_data.sh {0} {1}".format( - args.feat_dir, num_jobs)) - shutil.copy('{0}/tree'.format(args.tree_dir), args.dir) - with open('{0}/num_jobs'.format(args.dir), 'w') as f: - f.write(str(num_jobs)) - - config_dir = '{0}/configs'.format(args.dir) - var_file = '{0}/vars'.format(config_dir) - - variables = common_train_lib.parse_generic_config_vars_file(var_file) - - # Set some variables. - try: - model_left_context = variables['model_left_context'] - model_right_context = variables['model_right_context'] - except KeyError as e: - raise Exception("KeyError {0}: Variables need to be defined in " - "{1}".format(str(e), '{0}/configs'.format(args.dir))) - - left_context = args.chunk_left_context + model_left_context - right_context = args.chunk_right_context + model_right_context - left_context_initial = (args.chunk_left_context_initial + model_left_context if - args.chunk_left_context_initial >= 0 else -1) - right_context_final = (args.chunk_right_context_final + model_right_context if - args.chunk_right_context_final >= 0 else -1) - - # Initialize as "raw" nnet, prior to training the LDA-like preconditioning - # matrix. This first config just does any initial splicing that we do; - # we do this as it's a convenient way to get the stats for the 'lda-like' - # transform. - if (args.stage <= -6): - logger.info("Creating phone language-model") - chain_lib.create_phone_lm(args.dir, args.tree_dir, run_opts, - alignment_dirs=args.alignments_for_lm, - lm_opts=args.lm_opts) - - if (args.stage <= -5): - logger.info("Creating denominator FST") - chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts) - - if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config"): - logger.info("Initializing a basic network for estimating " - "preconditioning matrix") - common_lib.execute_command( - """{command} {dir}/log/nnet_init.log \ - nnet3-init --srand=-2 {dir}/configs/init.config \ - {dir}/init.raw""".format(command=run_opts.command, - dir=args.dir)) - - egs_left_context = left_context + args.frame_subsampling_factor / 2 - egs_right_context = right_context + args.frame_subsampling_factor / 2 - # note: the '+ args.frame_subsampling_factor / 2' is to allow for the - # fact that we'll be shifting the data slightly during training to give - # variety to the training data. - egs_left_context_initial = (left_context_initial + args.frame_subsampling_factor / 2 if - left_context_initial >= 0 else -1) - egs_right_context_final = (right_context_final + args.frame_subsampling_factor / 2 if - right_context_final >= 0 else -1) - - default_egs_dir = '{0}/egs'.format(args.dir) - if (args.stage <= -3) and args.egs_dir is None: - logger.info("Generating egs") - # this is where get_egs.sh is called. 
- chain_lib.generate_chain_egs( - dir=args.dir, data=args.feat_dir, - lat_dir=args.lat_dir, egs_dir=default_egs_dir, - left_context=egs_left_context, - right_context=egs_right_context, - left_context_initial=egs_left_context_initial, - right_context_final=egs_right_context_final, - run_opts=run_opts, - left_tolerance=args.left_tolerance, - right_tolerance=args.right_tolerance, - frame_subsampling_factor=args.frame_subsampling_factor, - alignment_subsampling_factor=args.alignment_subsampling_factor, - frames_per_eg_str=args.chunk_width, - srand=args.srand, - egs_opts=args.egs_opts, - cmvn_opts=args.cmvn_opts, - online_ivector_dir=args.online_ivector_dir, - frames_per_iter=args.frames_per_iter, - transform_dir=args.transform_dir, - stage=args.egs_stage) - - if args.egs_dir is None: - egs_dir = default_egs_dir - else: - egs_dir = args.egs_dir - - [egs_left_context, egs_right_context, - frames_per_eg_str, num_archives] = ( - common_train_lib.verify_egs_dir(egs_dir, feat_dim, - ivector_dim, ivector_id, - egs_left_context, egs_right_context, - egs_left_context_initial, - egs_right_context_final)) - - # check the cegs.*.ark is newer than phone LM "phone_lm.fst". - assert(os.path.getctime('{0}/cegs.1.ark'.format(egs_dir)) > - os.path.getctime('{0}/phone_lm.fst'.format(args.dir))) - assert(args.chunk_width == frames_per_eg_str) - num_archives_expanded = num_archives * args.frame_subsampling_factor - - if (args.num_jobs_final > num_archives_expanded): - raise Exception('num_jobs_final cannot exceed the ' - 'expanded number of archives') - - # copy the properties of the egs to dir for - # use during decoding - logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir)) - common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) - - if (args.stage <= -2) and os.path.exists(args.dir+"/configs/init.config"): - logger.info('Computing the preconditioning matrix for input features') - - chain_lib.compute_preconditioning_matrix( - args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs=args.max_lda_jobs, - rand_prune=args.rand_prune) - - if (args.stage <= -1): - logger.info("Preparing the initial acoustic model.") - chain_lib.prepare_initial_acoustic_model(args.dir, run_opts) - - with open("{0}/frame_subsampling_factor".format(args.dir), "w") as f: - f.write(str(args.frame_subsampling_factor)) - - # set num_iters so that as close as possible, we process the data - # $num_epochs times, i.e. $num_iters*$avg_num_jobs) == - # $num_epochs*$num_archives, where - # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
- num_archives_to_process = int(args.num_epochs * num_archives_expanded) - num_archives_processed = 0 - num_iters = ((num_archives_to_process * 2) - / (args.num_jobs_initial + args.num_jobs_final)) - - models_to_combine = common_train_lib.get_model_combine_iters( - num_iters, args.num_epochs, - num_archives_expanded, args.max_models_combine, - args.num_jobs_final) - - def learning_rate(iter, current_num_jobs, num_archives_processed): - return common_train_lib.get_learning_rate(iter, current_num_jobs, - num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate) - - min_deriv_time = None - max_deriv_time_relative = None - if args.deriv_truncate_margin is not None: - min_deriv_time = -args.deriv_truncate_margin - model_left_context - max_deriv_time_relative = \ - args.deriv_truncate_margin + model_right_context - - logger.info("Training will run for {0} epochs = " - "{1} iterations".format(args.num_epochs, num_iters)) - - for iter in range(num_iters): - if (args.exit_stage is not None) and (iter == args.exit_stage): - logger.info("Exiting early due to --exit-stage {0}".format(iter)) - return - current_num_jobs = int(0.5 + args.num_jobs_initial - + (args.num_jobs_final - args.num_jobs_initial) - * float(iter) / num_iters) - - if args.stage <= iter: - model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter) - - lrate = learning_rate(iter, current_num_jobs, - num_archives_processed) - shrink_value = 1.0 - if args.proportional_shrink != 0.0: - shrink_value = 1.0 - (args.proportional_shrink * lrate) - if shrink_value <= 0.5: - raise Exception("proportional-shrink={0} is too large, it gives " - "shrink-value={1}".format(args.proportional_shrink, - shrink_value)) - - if args.shrink_value < shrink_value: - shrink_value = (args.shrink_value - if common_train_lib.should_do_shrinkage( - iter, model_file, - args.shrink_saturation_threshold) - else shrink_value) - - chain_lib.train_one_iteration( - dir=args.dir, - iter=iter, - srand=args.srand, - egs_dir=egs_dir, - num_jobs=current_num_jobs, - num_archives_processed=num_archives_processed, - num_archives=num_archives, - learning_rate=lrate, - dropout_edit_string=common_train_lib.get_dropout_edit_string( - args.dropout_schedule, - float(num_archives_processed) / num_archives_to_process, - iter), - shrinkage_value=shrink_value, - num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, - apply_deriv_weights=args.apply_deriv_weights, - min_deriv_time=min_deriv_time, - max_deriv_time_relative=max_deriv_time_relative, - l2_regularize=args.l2_regularize, - xent_regularize=args.xent_regularize, - leaky_hmm_coefficient=args.leaky_hmm_coefficient, - momentum=args.momentum, - max_param_change=args.max_param_change, - shuffle_buffer_size=args.shuffle_buffer_size, - frame_subsampling_factor=args.frame_subsampling_factor, - run_opts=run_opts) - - if args.cleanup: - # do a clean up everythin but the last 2 models, under certain - # conditions - common_train_lib.remove_model( - args.dir, iter-2, num_iters, models_to_combine, - args.preserve_model_interval) - - if args.email is not None: - reporting_iter_interval = num_iters * args.reporting_interval - if iter % reporting_iter_interval == 0: - # lets do some reporting - [report, times, data] = ( - nnet3_log_parse.generate_acc_logprob_report( - args.dir, "log-probability")) - message = report - subject = ("Update : Expt {dir} : " - "Iter {iter}".format(dir=args.dir, iter=iter)) - common_lib.send_mail(message, subject, args.email) - - 
num_archives_processed = num_archives_processed + current_num_jobs - - if args.stage <= num_iters: - logger.info("Doing final combination to produce final.mdl") - chain_lib.combine_models( - dir=args.dir, num_iters=num_iters, - models_to_combine=models_to_combine, - num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, - egs_dir=egs_dir, - leaky_hmm_coefficient=args.leaky_hmm_coefficient, - l2_regularize=args.l2_regularize, - xent_regularize=args.xent_regularize, - run_opts=run_opts, - sum_to_one_penalty=args.combine_sum_to_one_penalty) - - - if args.cleanup: - logger.info("Cleaning up the experiment directory " - "{0}".format(args.dir)) - remove_egs = args.remove_egs - if args.egs_dir is not None: - # this egs_dir was not created by this experiment so we will not - # delete it - remove_egs = False - - common_train_lib.clean_nnet_dir( - args.dir, num_iters, egs_dir, - preserve_model_interval=args.preserve_model_interval, - remove_egs=remove_egs) - - # do some reporting - [report, times, data] = nnet3_log_parse.generate_acc_logprob_report( - args.dir, "log-probability") - if args.email is not None: - common_lib.send_mail(report, "Update : Expt {0} : " - "complete".format(args.dir), args.email) - - with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: - f.write(report) - - common_lib.execute_command("steps/info/nnet3_dir_info.pl " - "{0}".format(args.dir)) - - -def main(): - [args, run_opts] = get_args() - try: - train(args, run_opts) - common_lib.wait_for_background_commands() - except BaseException as e: - # look for BaseException so we catch KeyboardInterrupt, which is - # what we get when a background thread dies. - if args.email is not None: - message = ("Training session for experiment {dir} " - "died due to an error.".format(dir=args.dir)) - common_lib.send_mail(message, message, args.email) - if not isinstance(e, KeyboardInterrupt): - traceback.print_exc() - sys.exit(1) - -if __name__ == "__main__": - main() diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index e0592aa41c6..8204ec63eaf 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -31,25 +31,26 @@ def get_args(): parser.add_argument('--existing-model', help='Filename of previously trained neural net ' '(e.g. final.mdl) which is useful in case of ' - 'using list of component-nodes in already trained model ' + 'using nodes from list of component-nodes in ' + 'already trained model ' 'to generate new config file for new model.' + 'The context info is also generated using ' + 'final.config added to existing model.' 'e.g. In Transfer learning: generate new model using ' 'nodes in existing model.') parser.add_argument('--config-dir', required=True, help='Directory to write config files and variables') - parser.add_argument('--edits-config', type=str, default=None, + parser.add_argument('--nnet-edits', type=str, default=None, action=common_lib.NullstrToNoneAction, - help="This is nnet3 config filename that is useful in " - "case the network you are " - "creating does not have an output node called 'output' " - "(e.g. for multilingual setups). You can set this to " - "an edits-config can contain string like: " - "'rename-node old-name=xxx new-name=output' " - "if node xxx plays the role of the output node in this " - "network." 
- "This is only used for computing the left/right context.") - - print(' '.join(sys.argv)) + help="""This option is useful in case the network you + are creating does not have an output node called + 'output' (e.g. for multilingual setups). You can set + this to an edit-string like: 'rename-node old-name=xxx + new-name=output' if node xxx plays the role of the + output node in this network. This is only used for + computing the left/right context.""") + + print(' '.join(sys.argv), file=sys.stderr) args = parser.parse_args() args = check_args(args) @@ -227,19 +228,20 @@ def write_config_files(config_dir, all_layers): raise -def add_nnet_context_info(config_dir, existing_model=None, - edits_config=None): - """This will be removed when python script refactoring is done.""" - model = "{0}/ref.raw".format(config_dir) - if edits_config is not None: - model = """ - | nnet3-copy --edits-config={0} - {1}""".format(edits_config, - model) - common_lib.execute_command("""nnet3-init {0} {1}/ref.config """ - """ {2} """.format(existing_model if +def add_nnet_context_info(config_dir, nnet_edits=None, + existing_model=None): + """Create the 'vars' file that specifies model_left_context, etc.""" + + common_lib.execute_command("nnet3-init {0} {1}/ref.config " + "{1}/ref.raw".format(existing_model if existing_model is not None else "", - config_dir, model)) - out = common_lib.get_command_stdout("""nnet3-info {0}/ref.raw | """ - """head -4""".format(config_dir)) + config_dir)) + model = "{0}/ref.raw".format(config_dir) + if nnet_edits is not None: + model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits, + model) + out = common_lib.get_command_stdout('nnet3-info "{0}" | head -n 4 ' + .format(model)) # out looks like this # left-context: 7 # right-context: 0 @@ -260,22 +262,21 @@ def add_nnet_context_info(config_dir, existing_model=None, vf.write('model_right_context={0}\n'.format(info['right-context'])) vf.close() -def check_model_contexts(config_dir, existing_model=None, edits_config=None): +def check_model_contexts(config_dir, nnet_edits=None, existing_model=None): contexts = {} for file_name in ['init', 'ref']: if os.path.exists('{0}/{1}.config'.format(config_dir, file_name)): contexts[file_name] = {} + common_lib.execute_command("nnet3-init {0} {1}/{2}.config " + "{1}/{2}.raw".format(existing_model if + existing_model is not None else '', + config_dir, file_name)) model = "{0}/{1}.raw".format(config_dir, file_name) - if edits_config is not None: - model = """ - | nnet3-copy --edits-config={0} - {1}""".format(edits_config, - model) - common_lib.execute_command("""nnet3-init {0} {1}/{2}.config """ - """ {3} """.format(existing_model if - existing_model is not - None else "", config_dir, - file_name, model)) - out = common_lib.get_command_stdout("""nnet3-info {0}/{1}.raw | """ - """head -4""".format(config_dir, file_name)) + if nnet_edits is not None: + model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits, + model) + out = common_lib.get_command_stdout('nnet3-info "{0}" | head -n 4 ' + .format(model)) # out looks like this # left-context: 7 # right-context: 0 @@ -294,14 +295,17 @@ def check_model_contexts(config_dir, existing_model=None, edits_config=None): assert(contexts.has_key('ref')) if (contexts['init'].has_key('left-context') and contexts['ref'].has_key('left-context')): - if ((contexts['init']['left-context'] > contexts['ref']['left-context']) - or (contexts['init']['right-context'] > contexts['ref']['right-context'])): - raise Exception("Model specified in {0}/init.config requires 
greater" - " context than the model specified in {0}/ref.config." - " This might be due to use of label-delay at the output" - " in ref.config. Please use delay=$label_delay in the" - " initial fixed-affine-layer of the network, to avoid" - " this issue.") + if ((contexts['init']['left-context'] + > contexts['ref']['left-context']) + or (contexts['init']['right-context'] + > contexts['ref']['right-context'])): + raise Exception( + "Model specified in {0}/init.config requires greater" + " context than the model specified in {0}/ref.config." + " This might be due to use of label-delay at the output" + " in ref.config. Please use delay=$label_delay in the" + " initial fixed-affine-layer of the network, to avoid" + " this issue.") @@ -311,12 +315,13 @@ def main(): aux_layers = [] if args.existing_model is not None: aux_layers = xparser.get_model_component_info(args.existing_model) - all_layers = xparser.read_xconfig_file(args.xconfig_file, aux_layers) + all_layers = xparser.read_xconfig_file(args.xconfig_file, aux_layers) write_expanded_xconfig_files(args.config_dir, all_layers) write_config_files(args.config_dir, all_layers) - check_model_contexts(args.config_dir, args.existing_model, args.edits_config) - add_nnet_context_info(args.config_dir, args.existing_model, - args.edits_config) + check_model_contexts(args.config_dir, args.nnet_edits, + existing_model=args.existing_model) + add_nnet_context_info(args.config_dir, args.nnet_edits, + existing_model=args.existing_model) if __name__ == '__main__': From 49bcf2e2d3bc2684c80a8f32e5764f57a8c712e9 Mon Sep 17 00:00:00 2001 From: Pegita Date: Thu, 10 Aug 2017 16:31:00 -0400 Subject: [PATCH 035/174] removed changes to language-model.* and generated weighted phone lm using repeating alignments. --- .../local/chain/tuning/run_tdnn_wsj_rm_1a.sh | 8 ----- .../local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 8 ----- .../local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 8 ----- .../nnet3/chain/make_weighted_den_fst.sh | 13 ++++++-- src/chain/language-model.cc | 11 +++---- src/chain/language-model.h | 6 ++-- src/chainbin/chain-est-phone-lm.cc | 32 ++++++------------- 7 files changed, 28 insertions(+), 58 deletions(-) diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh index 23a3b91fbfe..2c8be9428bb 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -183,13 +183,5 @@ if [ $stage -le 10 ]; then --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ $dir/graph data/test_hires $dir/decode || exit 1; fi - -if [ $stage -le 11 ]; then - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 20 --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ - $dir/graph_ug data/test_hires $dir/decode_ug || exit 1; -fi wait; exit 0; diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index 2aff104a22a..de375f91a48 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -196,13 +196,5 @@ if [ $stage -le 10 ]; then --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ $dir/graph data/test_hires $dir/decode || exit 1; fi - -if [ $stage -le 11 ]; then - utils/mkgraph.sh --self-loop-scale 1.0 $lang_ug_dir $dir $dir/graph_ug - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 20 --cmd 
"$decode_cmd" \ - --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ - $dir/graph_ug data/test_hires $dir/decode_ug || exit 1; -fi wait; exit 0; diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index b4063cfe014..1c3e0f073e2 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -199,13 +199,5 @@ if [ $stage -le 10 ]; then --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ $dir/graph data/test_hires $dir/decode || exit 1; fi - -if [ $stage -le 11 ]; then - utils/mkgraph.sh --self-loop-scale 1.0 ${lang_src_tgt}_ug $dir $dir/graph_ug - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 20 --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ - $dir/graph_ug data/test_hires $dir/decode_ug || exit 1; -fi wait; exit 0; diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index 67d6eefce5b..b01db3cab15 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -66,14 +66,23 @@ done cp ${ali_dirs[0]}/tree $dir/ || exit 1 + for n in `seq 0 $[num_alignments-1]`; do adir=${ali_dirs[$n]} - alignments+=("ark:gunzip -c $adir/ali.*.gz | ali-to-phones $adir/final.mdl ark:- ark:- |") + w=`echo $weights | cut -d, -f$[$n+1]` + if ! [[ $w =~ ^[+]?[0-9]+$ ]]; then + echo "no positive integer weight specified for alignment $adir" && exit 1; + fi + repeated_ali_to_process="" + for x in `seq $w`;do + repeated_ali_to_process="ark:gunzip -c $adir/ali.*.gz $repeated_ali_to_process" + done + alignments+=("$repeated_ali_to_process | ali-to-phones $adir/final.mdl ark:- ark:- |") done if [ $stage -le 1 ]; then $cmd $dir/log/make_phone_lm.log \ - chain-est-phone-lm $lm_opts --scales="$weights" \ + chain-est-phone-lm $lm_opts \ "${alignments[@]}" $dir/phone_lm.fst || exit 1 fi diff --git a/src/chain/language-model.cc b/src/chain/language-model.cc index 892ab30958c..41e06116ea8 100644 --- a/src/chain/language-model.cc +++ b/src/chain/language-model.cc @@ -26,8 +26,7 @@ namespace kaldi { namespace chain { -void LanguageModelEstimator::AddCounts(const std::vector &sentence, - int32 weight) { +void LanguageModelEstimator::AddCounts(const std::vector &sentence) { KALDI_ASSERT(opts_.ngram_order >= 2 && "--ngram-order must be >= 2"); KALDI_ASSERT(opts_.ngram_order >= opts_.no_prune_ngram_order); int32 order = opts_.ngram_order; @@ -37,23 +36,23 @@ void LanguageModelEstimator::AddCounts(const std::vector &sentence, end = sentence.end(); for (; iter != end; ++iter) { KALDI_ASSERT(*iter != 0); - IncrementCount(history, *iter, weight); + IncrementCount(history, *iter); history.push_back(*iter); if (history.size() >= order) history.erase(history.begin()); } // Probability of end of sentence. This will end up getting ignored later, but // it still makes a difference for probability-normalization reasons. 
- IncrementCount(history, 0, weight); + IncrementCount(history, 0); } void LanguageModelEstimator::IncrementCount(const std::vector &history, - int32 next_phone, int32 weight) { + int32 next_phone) { int32 lm_state_index = FindOrCreateLmStateIndexForHistory(history); if (lm_states_[lm_state_index].tot_count == 0) { num_active_lm_states_++; } - lm_states_[lm_state_index].AddCount(next_phone, weight); + lm_states_[lm_state_index].AddCount(next_phone, 1); } void LanguageModelEstimator::SetParentCounts() { diff --git a/src/chain/language-model.h b/src/chain/language-model.h index 1109d832bf7..b2c3f4cd746 100644 --- a/src/chain/language-model.h +++ b/src/chain/language-model.h @@ -89,9 +89,9 @@ class LanguageModelEstimator { } // Adds counts for this sentence. Basically does: for each n-gram in the - // sentence, count[n-gram] += weight. The only constraint on 'sentence' is that it + // sentence, count[n-gram] += 1. The only constraint on 'sentence' is that it // should contain no zeros. - void AddCounts(const std::vector &sentence, int32 weight = 1.0); + void AddCounts(const std::vector &sentence); // Estimates the LM and outputs it as an FST. Note: there is // no concept here of backoff arcs. @@ -188,7 +188,7 @@ class LanguageModelEstimator { // adds the counts for this ngram (called from AddCounts()). inline void IncrementCount(const std::vector &history, - int32 next_phone, int32 weight); + int32 next_phone); // Computes whether backoff should be allowed for this lm_state. (the caller diff --git a/src/chainbin/chain-est-phone-lm.cc b/src/chainbin/chain-est-phone-lm.cc index 3e82417c315..f16b3f4f14b 100644 --- a/src/chainbin/chain-est-phone-lm.cc +++ b/src/chainbin/chain-est-phone-lm.cc @@ -32,7 +32,7 @@ int main(int argc, char *argv[]) { "Initialize un-smoothed phone language model for 'chain' training\n" "Output in FST format (epsilon-free deterministic acceptor)\n" "\n" - "Usage: chain-est-phone-lm [options] [... 
] \n" + "Usage: chain-est-phone-lm [options] \n" "The phone-sequences are used to train a language model.\n" "e.g.:\n" "gunzip -c input_dir/ali.*.gz | ali-to-phones input_dir/final.mdl ark:- ark:- | \\\n" @@ -40,43 +40,29 @@ int main(int argc, char *argv[]) { bool binary_write = true; LanguageModelOptions lm_opts; - std::string scales_str; ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); - po.Register("scales", &scales_str, "comma-separated list of integer valued scale weights used to scale different phone sequences."); lm_opts.Register(&po); po.Read(argc, argv); - if (po.NumArgs() < 2) { + if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); } - int32 num_seqs = po.NumArgs() - 1; - std::vector scales(num_seqs, 1); - if (!scales_str.empty()) { - SplitStringToIntegers(scales_str, ",", false, &scales); - if (scales.size() != num_seqs) - KALDI_ERR << "--scales should have exactly " - << num_seqs << " scales."; - } - std::string lm_fst_wxfilename = po.GetArg(po.NumArgs()); + std::string phone_seqs_rspecifier = po.GetArg(1), + lm_fst_wxfilename = po.GetArg(2); LanguageModelEstimator lm_estimator(lm_opts); - for (int i = 1; i <= num_seqs; i++) { - std::string phone_seqs_rspecifier = po.GetArg(i); - SequentialInt32VectorReader phones_reader(phone_seqs_rspecifier); - KALDI_LOG << "Reading phone sequences"; - for (; !phones_reader.Done(); phones_reader.Next()) { - if (scales[i-1] != 0) { - const std::vector &phone_seq = phones_reader.Value(); - lm_estimator.AddCounts(phone_seq, scales[i-1]); - } - } + SequentialInt32VectorReader phones_reader(phone_seqs_rspecifier); + KALDI_LOG << "Reading phone sequences"; + for (; !phones_reader.Done(); phones_reader.Next()) { + const std::vector &phone_seq = phones_reader.Value(); + lm_estimator.AddCounts(phone_seq); } KALDI_LOG << "Estimating phone LM"; fst::StdVectorFst fst; From d25e63a1c1d5e425efc45e30427243caf066c73c Mon Sep 17 00:00:00 2001 From: Pegita Date: Thu, 10 Aug 2017 17:53:25 -0400 Subject: [PATCH 036/174] optimized alignment processing stage in weighted phone lm generation. 
--- .../s5/local/nnet3/run_tdnn_multilingual.sh | 9 +----- .../local/chain/tuning/run_tdnn_wsj_rm_1a.sh | 2 -- .../local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 3 +- .../local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 8 +++++ .../nnet3/chain/make_weighted_den_fst.sh | 30 ++++++++----------- 5 files changed, 22 insertions(+), 30 deletions(-) diff --git a/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh b/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh index 70ac89cf61e..6504f98c591 100755 --- a/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh +++ b/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh @@ -204,16 +204,9 @@ EOF echo " output-layer name=output-${lang_index} dim=$num_targets max-change=1.5" done >> $dir/configs/network.xconfig - cat < $dir/configs/edits.config - rename-node old-name=output-0 new-name=output -EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ --config-dir $dir/configs/ \ - --edits-config=$dir/configs/edits.config - - cat <> $dir/configs/vars -include_log_softmax=false -EOF + --nnet-edits="rename-node old-name=output-0 new-name=output" fi if [ $stage -le 9 ]; then diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh index 2c8be9428bb..0781cb8c3d3 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -32,7 +32,6 @@ xent_regularize=0.1 # configs for transfer learning srcdir=../../wsj/s5/ common_egs_dir= -#common_egs_dir=exp/chain/tdnn_wsj_rm_1c_fixed_ac_scale/egs src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl primary_lr_factor=0.25 # The learning-rate factor for transferred layers from source # model. @@ -55,7 +54,6 @@ EOF fi required_files="$src_mdl $srcdir/exp/nnet3/extractor/final.mdl" - for f in $required_files; do if [ ! -f $f ]; then echo "$0: no such file $f" diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index de375f91a48..07ce76c82f2 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -43,8 +43,7 @@ xent_regularize=0.1 # configs for transfer learning common_egs_dir= -#srcdir=../../wsj/s5/ -srcdir=/export/a09/pegahgh/kaldi-transfer-learning/egs/wsj/s5-sp +srcdir=../../wsj/s5/ src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl src_lang=$srcdir/data/lang src_gmm_dir=$srcdir/exp/tri4b diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index 1c3e0f073e2..89d76a3f8d0 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -87,6 +87,14 @@ ali_dir=exp/chain/chain_ali_wsj treedir=exp/chain/tri4_5n_tree_wsj lat_dir=exp/chain_lats${src_tree_dir:+_wsj} +required_files="$src_mdl $src_lang/phones.txt $srcdir/data/local/dict_nosp/lexicon.txt $src_tree_dir/tree" + +for f in $required_files; do + if [ ! -f $f ]; then + echo "$0: no such file $f" + fi +done + if [ $stage -le -1 ]; then echo "$0: prepare lang for RM-WSJ using WSJ phone set and lexicon and RM word list." if ! 
cmp -s <(grep -v "^#" $src_lang/phones.txt) <(grep -v "^#" data/lang/phones.txt); then diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index b01db3cab15..6e97d6cdc9e 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -65,25 +65,19 @@ for n in `seq 0 $[$num_alignments-1]`;do done cp ${ali_dirs[0]}/tree $dir/ || exit 1 - - -for n in `seq 0 $[num_alignments-1]`; do - adir=${ali_dirs[$n]} - w=`echo $weights | cut -d, -f$[$n+1]` - if ! [[ $w =~ ^[+]?[0-9]+$ ]]; then - echo "no positive integer weight specified for alignment $adir" && exit 1; - fi - repeated_ali_to_process="" - for x in `seq $w`;do - repeated_ali_to_process="ark:gunzip -c $adir/ali.*.gz $repeated_ali_to_process" - done - alignments+=("$repeated_ali_to_process | ali-to-phones $adir/final.mdl ark:- ark:- |") -done - + #if ! [[ $w =~ ^[+]?[0-9]+$ ]] \; then + # echo "no positive integer weight specified for alignment $adir" && exit 1; + #fi if [ $stage -le 1 ]; then - $cmd $dir/log/make_phone_lm.log \ - chain-est-phone-lm $lm_opts \ - "${alignments[@]}" $dir/phone_lm.fst || exit 1 + $cmd $dir/log/make_phone_lm_fst.log \ + ali_dirs=\(${ali_dirs[@]}\) \; \ + for n in `seq 0 $[num_alignments-1]`\; do \ + adir=\${ali_dirs[\$n]} \; \ + w=\$\(echo $weights \| cut -d, -f\$[\$n+1]\) \; \ + for x in \$\(seq \$w\)\; do gunzip -c \$adir/ali.*.gz \; done \| \ + ali-to-phones \$adir/final.mdl ark:- ark:- \; \ + done \| \ + chain-est-phone-lm $lm_opts ark:- $dir/phone_lm.fst fi if [ $stage -le 2 ]; then From f2d01aea04402c16d79f340d3b3dae4bd219dff4 Mon Sep 17 00:00:00 2001 From: Pegita Date: Thu, 10 Aug 2017 18:02:05 -0400 Subject: [PATCH 037/174] added check to have possitive int as phone lm weights. --- egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index 6e97d6cdc9e..cd284dbfe3e 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -65,10 +65,14 @@ for n in `seq 0 $[$num_alignments-1]`;do done cp ${ali_dirs[0]}/tree $dir/ || exit 1 - #if ! [[ $w =~ ^[+]?[0-9]+$ ]] \; then - # echo "no positive integer weight specified for alignment $adir" && exit 1; - #fi + if [ $stage -le 1 ]; then + for n in `seq 0 $[num_alignments-1]`; do + w=$(echo $weights | cut -d, -f$[$n+1]) + if ! [[ $w =~ ^[+]?[0-9]+$ ]] ; then + echo "no positive integer weight specified for alignment ${ali_dirs[$n]}" && exit 1; + fi + done $cmd $dir/log/make_phone_lm_fst.log \ ali_dirs=\(${ali_dirs[@]}\) \; \ for n in `seq 0 $[num_alignments-1]`\; do \ From 293c531b8f8eed381a16ba0e6fbc615818f8fbbd Mon Sep 17 00:00:00 2001 From: Pegita Date: Thu, 10 Aug 2017 18:12:38 -0400 Subject: [PATCH 038/174] fixed small issue with train_dnn.py. --- egs/wsj/s5/steps/nnet3/train_dnn.py | 65 ++++++++++++++++------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 3b2c09a7fa9..87a1fd5afed 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -199,16 +199,14 @@ def train(args, run_opts): # we do this as it's a convenient way to get the stats for the 'lda-like' # transform. 
- if (args.stage <= -5): - if os.path.exists(args.dir+"/configs/init.config"): - logger.info("Initializing a basic network for estimating " - "preconditioning matrix") - common_lib.run_job( - """{command} {dir}/log/nnet_init.log \ - nnet3-init --srand=-2 {dir}/configs/init.config \ - {dir}/init.raw""".format(command=run_opts.command, - dir=args.dir)) - assert(os.path.exists(args.dir+"/init.raw")) + if (args.stage <= -5) and os.path.exists(args.dir+"/configs/init.config"): + logger.info("Initializing a basic network for estimating " + "preconditioning matrix") + common_lib.execute_command( + """{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config \ + {dir}/init.raw""".format(command=run_opts.command, + dir=args.dir)) default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -4) and args.egs_dir is None: @@ -283,10 +281,15 @@ def train(args, run_opts): num_iters = ((num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)) - models_to_combine = common_train_lib.get_model_combine_iters( - num_iters, args.num_epochs, - num_archives_expanded, args.max_models_combine, - args.num_jobs_final) + # If do_final_combination is True, compute the set of models_to_combine. + # Otherwise, models_to_combine will be none. + if args.do_final_combination: + models_to_combine = common_train_lib.get_model_combine_iters( + num_iters, args.num_epochs, + num_archives_expanded, args.max_models_combine, + args.num_jobs_final) + else: + models_to_combine = None logger.info("Training will run for {0} epochs = " "{1} iterations".format(args.num_epochs, num_iters)) @@ -354,28 +357,34 @@ def train(args, run_opts): num_archives_processed = num_archives_processed + current_num_jobs if args.stage <= num_iters: - logger.info("Doing final combination to produce final.mdl") - train_lib.common.combine_models( - dir=args.dir, num_iters=num_iters, - models_to_combine=models_to_combine, - egs_dir=egs_dir, - minibatch_size_str=args.minibatch_size, run_opts=run_opts, - sum_to_one_penalty=args.combine_sum_to_one_penalty) - + if args.do_final_combination: + logger.info("Doing final combination to produce final.mdl") + train_lib.common.combine_models( + dir=args.dir, num_iters=num_iters, + models_to_combine=models_to_combine, + egs_dir=egs_dir, + minibatch_size_str=args.minibatch_size, run_opts=run_opts, + sum_to_one_penalty=args.combine_sum_to_one_penalty) + if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of " "adjusting the priors.") + + # If args.do_final_combination is true, we will use the combined model. + # Otherwise, we will use the last_numbered model. 
+ real_iter = 'combined' if args.do_final_combination else num_iters avg_post_vec_file = train_lib.common.compute_average_posterior( - dir=args.dir, iter='combined', egs_dir=egs_dir, - num_archives=num_archives, + dir=args.dir, iter=real_iter, + egs_dir=egs_dir, num_archives=num_archives, prior_subset_size=args.prior_subset_size, run_opts=run_opts) logger.info("Re-adjusting priors based on computed posteriors") - combined_model = "{dir}/combined.mdl".format(dir=args.dir) + combined_or_last_numbered_model = "{dir}/{iter}.mdl".format(dir=args.dir, + iter=real_iter) final_model = "{dir}/final.mdl".format(dir=args.dir) - train_lib.common.adjust_am_priors(args.dir, combined_model, - avg_post_vec_file, final_model, - run_opts) + train_lib.common.adjust_am_priors(args.dir, combined_or_last_numbered_model, + avg_post_vec_file, final_model, run_opts) + if args.cleanup: logger.info("Cleaning up the experiment directory " From 5b510f9ad38c3d837839e839056e490007d0bfae Mon Sep 17 00:00:00 2001 From: Pegita Date: Fri, 11 Aug 2017 13:12:08 -0400 Subject: [PATCH 039/174] fixed some issues. --- .../local/chain/tuning/run_tdnn_wsj_rm_1a.sh | 12 +- .../local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 52 ++--- .../local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 43 ++-- egs/rm/s5/local/prepare_wsj_rm_lang.sh | 2 +- .../nnet3/train/chain_objf/acoustic_model.py | 9 +- egs/wsj/s5/steps/libs/nnet3/train/common.py | 9 +- .../steps/libs/nnet3/xconfig/basic_layers.py | 15 +- egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py | 2 +- egs/wsj/s5/steps/nnet3/align_lats.sh | 190 ++++++++++++++++++ .../nnet3/chain/make_weighted_den_fst.sh | 39 ++-- egs/wsj/s5/steps/nnet3/chain/train.py | 14 +- egs/wsj/s5/steps/nnet3/xconfig_to_configs.py | 14 +- 12 files changed, 309 insertions(+), 92 deletions(-) create mode 100755 egs/wsj/s5/steps/nnet3/align_lats.sh diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh index 0781cb8c3d3..651f43093ae 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -30,9 +30,11 @@ remove_egs=false xent_regularize=0.1 # configs for transfer learning -srcdir=../../wsj/s5/ +srcdir=../../wsj/s5 # base dir for source dataset. +src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl # input dnn model for source data + # that is used in transfer learning. + common_egs_dir= -src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl primary_lr_factor=0.25 # The learning-rate factor for transferred layers from source # model. dim=450 @@ -87,7 +89,7 @@ if [ $stage -le 5 ]; then # Create a version of the lang/ directory that has one state per phone in the # topo file. [note, it really has two states.. the first one is only repeated # once, the second one has zero or more repeats.] - rm -rf $lang + rm -r $lang 2>/dev/null || true cp -r data/lang $lang silphonelist=$(cat $lang/phones/silence.csl) || exit 1; nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; @@ -125,8 +127,8 @@ EOF # Set the learning-rate-factor to be primary_lr_factor for initial network." 
# and add new layer to initial model $train_cmd $dir/log/generate_input_mdl.log \ - nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" $src_mdl - \| \ - nnet3-init --srand=1 - $dir/configs/final.config $dir/input.raw || exit 1; + nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" $src_mdl - \| \ + nnet3-init --srand=1 - $dir/configs/final.config $dir/input.raw || exit 1; fi if [ $stage -le 8 ]; then diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index 07ce76c82f2..de8e2729832 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -43,21 +43,27 @@ xent_regularize=0.1 # configs for transfer learning common_egs_dir= -srcdir=../../wsj/s5/ -src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl -src_lang=$srcdir/data/lang -src_gmm_dir=$srcdir/exp/tri4b +srcdir=../../wsj/s5 # base directory for source data +src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl # input dnn model for source data + # that is used in transfer learning. + +src_lang=$srcdir/data/lang # source lang directory used to generate source model. + # new new lang dir for transfer learning prepared + # using source phone set, lexicon in src_lang and + # target word list. + +src_gmm_dir=$srcdir/exp/tri4b # source gmm dir used to generate alignments + # for target data. + src_tree_dir=$srcdir/exp/chain/tree_a_sp # chain tree-dir for src data; # the alignment in target domain is # converted using src-tree primary_lr_factor=0.25 # learning-rate factor for all except last layer in transferring source model final_lr_factor=1.0 # learning-rate factor for final layer in transferring source model. nnet_affix=_online_wsj -tgt_lm_scale=10 -src_lm_scale=1 phone_lm_scales="1,10" # comma-separated list of integer valued scale weights # to scale different phone sequences for different alignments - # e.g. (src-weight,target-weight)=(10,1) + # e.g. (src-weight,target-weight)=(1,10) tdnn_affix=_1b # End configuration section. @@ -80,12 +86,13 @@ fi # run those things. -lang_dir=data/lang_wsj_rm +lang_src_tgt=data/lang_wsj_rm # This dir is prepared using phones.txt and lexicon from + # WSJ and wordlist and G.fst from RM. ali_dir=exp/tri4b${src_tree_dir:+_wsj}_ali lat_dir=exp/tri3b_lats${src_tree_dir:+_wsj} dir=exp/chain/tdnn_wsj_rm${tdnn_affix} -required_files="$src_mdl $src_lang/phones.txt $srcdir/data/local/dict_nosp/lexicon.txt $src_gmm_dir/final.mdl $src_tree_dir/tree" +required_files="$src_mdl $srcdir/exp/nnet3/extractor/final.mdl $src_lang/phones.txt $srcdir/data/local/dict_nosp/lexicon.txt $src_gmm_dir/final.mdl $src_tree_dir/tree" for f in $required_files; do if [ ! -f $f ]; then @@ -96,10 +103,10 @@ done if [ $stage -le -1 ]; then echo "$0: prepare lexicon.txt for RM using WSJ lexicon." if ! cmp -s <(grep -v "^#" $src_lang/phones.txt) <(grep -v "^#" data/lang/phones.txt); then - local/prepare_wsj_rm_lang.sh $srcdir/data/local/dict_nosp $srcdir/data/lang $lang_dir + local/prepare_wsj_rm_lang.sh $srcdir/data/local/dict_nosp $srcdir/data/lang $lang_src_tgt else - rm -rf $lang_dir - cp -r data/lang $lang_dir + rm -rf $lang_src_tgt 2>/dev/null || true + cp -r data/lang $lang_src_tgt fi fi @@ -112,7 +119,7 @@ local/online/run_nnet2_common.sh --stage $stage \ if [ $stage -le 4 ]; then echo "$0: Generate alignment using source model." 
steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ - data/train $lang_dir $src_gmm_dir $ali_dir || exit 1; + data/train $lang_src_tgt $src_gmm_dir $ali_dir || exit 1; fi @@ -121,32 +128,31 @@ if [ $stage -le 5 ]; then # use the same num-jobs as the alignments nj=$(cat exp/tri3b_ali/num_jobs) || exit 1; steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/train \ - $lang_dir $src_gmm_dir $lat_dir || exit 1; + $lang_src_tgt $src_gmm_dir $lat_dir || exit 1; rm $lat_dir/fsts.*.gz # save space fi if [ $stage -le 6 ]; then # set the learning-rate-factor for initial network to be primary_lr_factor." $train_cmd $dir/log/generate_input_mdl.log \ - nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ - $src_mdl $dir/input.raw || exit 1; + nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ + $src_mdl $dir/input.raw || exit 1; fi if [ $stage -le 7 ]; then echo "$0: compute {den,normalization}.fst using weighted phone LM with wsj and rm weight $phone_lm_scales." - $train_cmd $dir/log/make_weighted_den_fst.log \ - steps/nnet3/chain/make_weighted_den_fst.sh --weights $phone_lm_scales \ - --lm-opts '--num-extra-lm-states=200' \ - $src_tree_dir $ali_dir $dir || exit 1; + steps/nnet3/chain/make_weighted_den_fst.sh --cmd "$train_cmd" \ + --weights $phone_lm_scales \ + --lm-opts '--num-extra-lm-states=200' \ + $src_tree_dir $ali_dir $dir || exit 1; fi if [ $stage -le 8 ]; then - echo "$0: generate egs for chain to train new model on rm dataset." if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi - # exclude phone_LM and den.fst generation training stage + # exclude phone_LM and den.fst generation training stages if [ $train_stage -lt -4 ]; then train_stage=-4 fi @@ -188,7 +194,7 @@ if [ $stage -le 10 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 $lang_dir $dir $dir/graph + utils/mkgraph.sh --self-loop-scale 1.0 $lang_src_tgt $dir $dir/graph steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --scoring-opts "--min-lmwt 1" \ --nj 20 --cmd "$decode_cmd" \ diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index 89d76a3f8d0..dfc003fd9f0 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -49,12 +49,19 @@ xent_regularize=0.1 # configs for transfer learning common_egs_dir= -srcdir=../../wsj/s5 -src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl -src_lang=$srcdir/data/lang -src_tree_dir=$srcdir/exp/chain/tree_a_sp # chain tree-dir for src data; +srcdir=../../wsj/s5 # base directory for source data +src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl # input dnn model for source data + # that is used in transfer learning. + +src_lang=$srcdir/data/lang # source lang directory used to generate source model. + # new new lang dir for transfer learning prepared + # using source phone set, lexicon in src_lang and + # target word list. 
+ +src_tree_dir=$srcdir/exp/chain/tree_a_sp # chain tree-dir for source dataset; # the alignment in target domain is # converted using src-tree + primary_lr_factor=0.25 # learning-rate factor for all except last layer in transferred source model nnet_affix=_online_wsj @@ -82,12 +89,13 @@ fi # run those things. lang=data/lang_chain_5n_wsj -lang_src_tgt=data/lang_wsj_rm +lang_src_tgt=data/lang_wsj_rm # This dir is prepared using phones.txt and lexicon from + # source(WSJ) and and wordlist and G.fst from target(RM) ali_dir=exp/chain/chain_ali_wsj treedir=exp/chain/tri4_5n_tree_wsj lat_dir=exp/chain_lats${src_tree_dir:+_wsj} -required_files="$src_mdl $src_lang/phones.txt $srcdir/data/local/dict_nosp/lexicon.txt $src_tree_dir/tree" +required_files="$src_mdl $srcdir/exp/nnet3/extractor/final.md $src_lang/phones.txt $srcdir/data/local/dict_nosp/lexicon.txt $src_tree_dir/tree" for f in $required_files; do if [ ! -f $f ]; then @@ -100,7 +108,7 @@ if [ $stage -le -1 ]; then if ! cmp -s <(grep -v "^#" $src_lang/phones.txt) <(grep -v "^#" data/lang/phones.txt); then local/prepare_wsj_rm_lang.sh $srcdir/data/local/dict_nosp $src_lang $lang_dir else - rm -rf $lang_dir + rm -rf $lang_dir 2>/dev/null || true cp -r data/lang $lang_dir fi fi @@ -122,13 +130,11 @@ if [ $stage -le 4 ]; then data/train_hires $lang_src_tgt $src_mdl_dir $ali_dir || exit 1; fi -chain_opts=(--chain.alignment-subsampling-factor=1 --chain.left-tolerance=1 --chain.right-tolerance=1) if [ $stage -le 5 ]; then # Get the alignments as lattices (gives the chain training more freedom). # use the same num-jobs as the alignments scale_opts="--transition-scale=1.0 --self-loop-scale=1.0" - nj=$(cat $ali_dir/num_jobs) || exit 1; - steps/nnet3/align_lats.sh --nj $nj --cmd "$train_cmd" \ + steps/nnet3/align_lats.sh --nj 100 --cmd "$train_cmd" \ --acoustic-scale 1.0 --extra-left-context-initial 0 --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --scale-opts "$scale_opts" \ @@ -140,20 +146,18 @@ fi if [ $stage -le 6 ]; then # set the learning-rate-factor for initial network to be primary_lr_factor." $train_cmd $dir/log/generate_input_mdl.log \ - nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ - $src_mdl $dir/input.raw || exit 1; + nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ + $src_mdl $dir/input.raw || exit 1; fi if [ $stage -le 7 ]; then echo "$0: compute {den,normalization}.fst using weighted phone LM." - $train_cmd $dir/log/make_weighted_den_fst.log \ steps/nnet3/chain/make_weighted_den_fst.sh --weights $phone_lm_scales \ - --lm-opts '--num-extra-lm-states=200' \ - $src_tree_dir $ali_dir $dir || exit 1; + --lm-opts '--num-extra-lm-states=200' \ + $src_tree_dir $ali_dir $dir || exit 1; fi if [ $stage -le 8 ]; then - echo "$0: generate egs for chain to train new model on rm dataset." if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage @@ -162,8 +166,11 @@ if [ $stage -le 8 ]; then if [ $train_stage -lt -4 ]; then train_stage=-4 fi - - steps/nnet3/chain/train_more.py --stage $train_stage ${chain_opts[@]} \ + # we used chain model from source to generate lats for target and the + # tolerance used in chain egs generation using this lats should be 1 or 2 which is + # (source_egs_tolerance/frame_subsampling_factor) + chain_opts=(--chain.alignment-subsampling-factor=1 --chain.left-tolerance=1 --chain.right-tolerance=1) + steps/nnet3/chain/train.py --stage $train_stage ${chain_opts[@]} \ --cmd "$decode_cmd" \ --trainer.input-model $dir/input.raw \ --feat.online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ diff --git a/egs/rm/s5/local/prepare_wsj_rm_lang.sh b/egs/rm/s5/local/prepare_wsj_rm_lang.sh index 9d3c8dd3c2d..8eeb45a90d6 100755 --- a/egs/rm/s5/local/prepare_wsj_rm_lang.sh +++ b/egs/rm/s5/local/prepare_wsj_rm_lang.sh @@ -29,7 +29,7 @@ for f in $required_dict_files; do fi done -rm -r $output_dir 2>/dev/null +rm -r $output_dir 2>/dev/null || true mkdir -p $output_dir mkdir -p $output_dir/local # copy *phones.txt from source to target. diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 4912ac12356..1b0878ea9a8 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -427,9 +427,12 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_mdl=None): - """ Adds the first layer; this will also add in the lda.mat and - presoftmax_prior_scale.vec. It will also prepare the acoustic model - with the transition model.""" + """ Adds the first layer; it will also prepare the acoustic model + with the transition model. + If input_mdl is specified, no initial network preparation (adding the + first layer) is done, and this model is used instead of the + '0.raw' acoustic model with the transition model. + """ if input_mdl is None: common_train_lib.prepare_initial_network(dir, run_opts, srand=srand) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index e49c74458fe..960d327b72a 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -358,9 +358,15 @@ def parse_generic_config_vars_file(var_file): def parse_input_model(input_model): + """ This function parses input_model and outputs the left and right contexts + for input_model. This function is an alternative to configs/vars, + if that file is not available, + e.g. when input_model is not generated using + configs but is passed directly to train.py + using --trainer.input-model.
+ """ variables = {} try: - assert(os.path.exists(input_model)) out = common_lib.get_command_stdout("""nnet3-info {0} | """ """head -4 """.format(input_model)) # out looks like this @@ -381,6 +387,7 @@ def parse_input_model(input_model): pass return variables + def verify_egs_dir(egs_dir, feat_dim, ivector_dim, ivector_extractor_id, left_context, right_context, left_context_initial=-1, right_context_final=-1): diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index f4b262732c0..4ad698085aa 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -1074,6 +1074,7 @@ def get_full_config(self): ans.append(('ref', line)) return ans + class XconfigAuxiliaryLayer(XconfigLayerBase): """This class is for lines like 'auxiliary name=aux dim=40' @@ -1084,7 +1085,7 @@ class XconfigAuxiliaryLayer(XconfigLayerBase): of that model as input to new layers. """ - def __init__(self, first_token, key_to_value, prev_names = None): + def __init__(self, first_token, key_to_value, prev_names=None): assert first_token == 'auxiliary' XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) @@ -1104,13 +1105,13 @@ def get_input_descriptor_names(self): return [] # there is no 'input' field in self.config. - def output_name(self, auxiliary_outputs = None): + def output_name(self, auxiliary_outputs=None): # there are no auxiliary outputs as this layer will just pass the input assert auxiliary_outputs is None return self.name - def output_dim(self, auxiliary_outputs = None): + def output_dim(self, auxiliary_outputs=None): # there are no auxiliary outputs as this layer will just pass the input assert auxiliary_outputs is None @@ -1118,13 +1119,9 @@ def output_dim(self, auxiliary_outputs = None): def get_full_config(self): - # unlike other layers the input layers need to be printed in - # 'init.config' (which initializes the neural network prior to the LDA) + # unlike other layers the auxiliary layers should not to be printed in + # any '*.config' ans = [] - for config_name in [ 'init', 'ref', 'final' ]: - ans.append( (config_name, - 'auxiliary-node name={0} dim={1}'.format(self.name, - self.config['dim']))) return ans diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py index bfae725ec6b..0cd22975c26 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py @@ -14,7 +14,7 @@ # Given a list of objects of type XconfigLayerBase ('all_layers'), # including at least the layers preceding 'current_layer' (and maybe # more layers), return the names of layers preceding 'current_layer' -# regardless of layers type 'auxiliary'. +# other than layers of type 'auxiliary'. # This will be used in parsing expressions like [-1] in descriptors # (which is an alias for the previous layer). def get_prev_names(all_layers, current_layer): diff --git a/egs/wsj/s5/steps/nnet3/align_lats.sh b/egs/wsj/s5/steps/nnet3/align_lats.sh new file mode 100755 index 00000000000..a5975989cf6 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/align_lats.sh @@ -0,0 +1,190 @@ +#!/bin/bash +# Copyright 2012 Brno University of Technology (Author: Karel Vesely) +# 2013 Johns Hopkins University (Author: Daniel Povey) +# 2015 Vijayaditya Peddinti +# 2016 Vimal Manohar +# Apache 2.0 + +# Computes training alignments using nnet3 DNN + +# Begin configuration section. +nj=4 +cmd=run.pl +stage=-1 +# Begin configuration. 
+scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +acoustic_scale=0.1 +beam=20 +transform_dir= +iter=final +frames_per_chunk=50 +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +feat_type= # you can set this to force it to use delta features. +graphs_scp= +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: $0 [--transform-dir <transform-dir>] <data-dir> <lang-dir> <src-dir> <align-dir>" + echo "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali" + echo "main options (for others, see top of script file)" + echo " --config <config-file> # config containing options" + echo " --nj <nj> # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +mkdir -p $dir/log +echo $nj > $dir/num_jobs +sdata=$data/split${nj}utt +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || \ + split_data.sh --per-utt $data $nj || exit 1; + +extra_files= +if [ ! -z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi + +for f in $srcdir/tree $srcdir/${iter}.mdl $data/feats.scp $lang/L.fst $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +cp $srcdir/{tree,${iter}.mdl} $dir || exit 1; + +utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; +## Set up features. Note: these are different from the normal features +## because we have one rspecifier that has the features for the entire +## training set, not separate ones for each batch. +if [ -z "$feat_type" ]; then + if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi +fi +echo "$0: feature type is $feat_type" + +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null + +case $feat_type in + raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" + ;; + lda) + splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + cp $srcdir/splice_opts $dir 2>/dev/null + cp $srcdir/final.mat $dir || exit 1; + feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -s $transform_dir/num_jobs ] && \ + echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; + nj_orig=$(cat $transform_dir/num_jobs) + + if [ $feat_type == "raw" ]; then trans=raw_trans; + else trans=trans; fi + if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then + echo "$0: LDA transforms differ between $srcdir and $transform_dir" + exit 1; + fi + if [ ! -f $transform_dir/$trans.1 ]; then + echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)" + exit 1; + fi + if [ $nj -ne $nj_orig ]; then + # Copy the transforms into an archive with an index.
+ for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" + else + # number of jobs matches with alignment dir. + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" + fi +fi + +ivector_opts= +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir" + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor) + frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" + cp $srcdir/frame_subsampling_factor $dir + if [ "$frame_subsampling_factor" -gt 1 ] && \ + [ "$scale_opts" == "--transition-scale=1.0 --self-loop-scale=0.1" ]; then + echo "$0: frame-subsampling-factor is not 1 (so likely a chain system)," + echo "... but the scale opts are the defaults. You probably want" + echo "--scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'" + sleep 1 + fi +fi + +if [ ! -z "$graphs_scp" ]; then + if [ ! -f $graphs_scp ]; then + echo "Could not find graphs $graphs_scp" && exit 1 + fi + tra="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $graphs_scp |" + prog=compile-train-graphs-fsts +else + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + prog=compile-train-graphs +fi + +if [ $stage -le 0 ]; then + ## because nnet3-latgen-faster doesn't support adding the transition-probs to the + ## graph itself, we need to bake them into the compiled graphs. This means we can't reuse previously compiled graphs, + ## because the other scripts write them without transition probs. + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + $prog --read-disambig-syms=$lang/phones/disambig.int \ + $scale_opts \ + $dir/tree $srcdir/${iter}.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1 +fi + +if [ $stage -le 1 ]; then + # Warning: nnet3-latgen-faster doesn't support a retry-beam so you may get more + # alignment errors (however, it does have a default min-active=200 so this + # will tend to reduce alignment errors). + # --allow_partial=false makes sure we reach the end of the decoding graph. + # --word-determinize=false makes sure we retain the alternative pronunciations of + # words (including alternatives regarding optional silences). + # --lattice-beam=$beam keeps all the alternatives that were within the beam, + # it means we do no pruning of the lattice (lattices from a training transcription + # will be small anyway). 
+ $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \ + nnet3-latgen-faster --acoustic-scale=$acoustic_scale $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --beam=$beam --lattice-beam=$beam \ + --allow-partial=false --word-determinize=false \ + $srcdir/${iter}.mdl "ark:gunzip -c $dir/fsts.JOB.gz |" \ + "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; +fi + +echo "$0: done aligning data." + diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index cd284dbfe3e..bbac3807e9d 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -1,7 +1,7 @@ #!/bin/bash -# Copyright 2014-17 Vimal Manohar -# 2017 Pegah Ghahremani +# Copyright 2017 Vimal Manohar +# 2017 Pegah Ghahremani # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -32,24 +32,23 @@ weights= # comma-separated list of integer valued scale weights used lm_opts='num_extra_lm_state=2000' #end configuration section. -help_message="Usage: "$(basename $0)" [options] [ ...] - E.g. "$(basename $0)" exp/tri1_ali exp/tri2_ali exp/chain/tdnn_1a_sp -Options: - --cmd (run.pl|queue.pl...) # specify how to run the sub-processes. -"; [ -f ./path.sh ] && . ./path.sh . parse_options.sh || exit 1; if [ $# -lt 2 ]; then - printf "$help_message\n"; + echo "Usage: $0 [options] [ ...] "; + echo "e.g.: $0 exp/tri1_ali exp/tri2_ali exp/chain/tdnn_1a_sp"; + echo "Options: " + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."; + echo "--lm-opts # options for phone LM generation"; exit 1; fi dir=${@: -1} # last argument to the script ali_dirs=( $@ ) # read the remaining arguments into an array unset ali_dirs[${#ali_dirs[@]}-1] # 'pop' the last argument which is odir -num_alignments=${#ali_dirs[@]} # number of systems to combine +num_alignments=${#ali_dirs[@]} # number of alignment dirs to combine mkdir -p $dir/log for n in `seq 0 $[$num_alignments-1]`;do @@ -61,7 +60,7 @@ for n in `seq 0 $[$num_alignments-1]`;do fi done utils/lang/check_phones_compatible.sh ${ali_dirs[0]}/phones.txt \ - ${ali_dirs[$n]}/phones.txt + ${ali_dirs[$n]}/phones.txt || exit 1; done cp ${ali_dirs[0]}/tree $dir/ || exit 1 @@ -70,22 +69,22 @@ if [ $stage -le 1 ]; then for n in `seq 0 $[num_alignments-1]`; do w=$(echo $weights | cut -d, -f$[$n+1]) if ! 
[[ $w =~ ^[+]?[0-9]+$ ]] ; then - echo "no positive integer weight specified for alignment ${ali_dirs[$n]}" && exit 1; + echo "no positive int weight specified for alignment ${ali_dirs[$n]}" && exit 1; fi done $cmd $dir/log/make_phone_lm_fst.log \ - ali_dirs=\(${ali_dirs[@]}\) \; \ - for n in `seq 0 $[num_alignments-1]`\; do \ - adir=\${ali_dirs[\$n]} \; \ - w=\$\(echo $weights \| cut -d, -f\$[\$n+1]\) \; \ - for x in \$\(seq \$w\)\; do gunzip -c \$adir/ali.*.gz \; done \| \ - ali-to-phones \$adir/final.mdl ark:- ark:- \; \ - done \| \ - chain-est-phone-lm $lm_opts ark:- $dir/phone_lm.fst + ali_dirs=\(${ali_dirs[@]}\) \; \ + for n in `seq 0 $[num_alignments-1]`\; do \ + adir=\${ali_dirs[\$n]} \; \ + w=\$\(echo $weights \| cut -d, -f\$[\$n+1]\) \; \ + for x in \$\(seq \$w\)\; do gunzip -c \$adir/ali.*.gz \; done \| \ + ali-to-phones \$adir/final.mdl ark:- ark:- \; \ + done \| \ + chain-est-phone-lm $lm_opts ark:- $dir/phone_lm.fst || exit 1; fi if [ $stage -le 2 ]; then - copy-transition-model ${ali_dirs[0]}/final.mdl $dir/0.trans_mdl + copy-transition-model ${ali_dirs[0]}/final.mdl $dir/0.trans_mdl || exit 1; fi if [ $stage -le 3 ]; then diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index ffca01bc31d..403969991f1 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -323,7 +323,8 @@ def train(args, run_opts): shutil.copy('{0}/tree'.format(args.tree_dir), args.dir) chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts) - if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config") and args.input_model is None: + if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config") + and args.input_model is None: logger.info("Initializing a basic network for estimating " "preconditioning matrix") common_lib.execute_command( @@ -345,9 +346,12 @@ def train(args, run_opts): default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") - assert(os.path.exists("{0}/den.fst".format(args.dir)) and - os.path.exists("{0}/normalization.fst".format(args.dir)) and - os.path.exists("{0}/tree".format(args.dir))) + if (not os.path.exists("{0}/den.fst".format(args.dir)) or + not os.path.exists("{0}/normalization.fst".format(args.dir)) or + not os.path.exists("{0}/tree".format(args.dir))): + raise Exception("Chain egs generation expects {0}/den.fst, " + "{0}/normalization.fst and {0}/tree " + "to exist.".format(args.dir)) # this is where get_egs.sh is called. 
chain_lib.generate_chain_egs( dir=args.dir, data=args.feat_dir, @@ -404,7 +408,7 @@ def train(args, run_opts): if (args.stage <= -1): logger.info("Preparing the initial acoustic model.") - chain_lib.prepare_initial_acoustic_model(args.dir, run_opts,input_mdl=args.input_model) + chain_lib.prepare_initial_acoustic_model(args.dir, run_opts, input_mdl=args.input_model) with open("{0}/frame_subsampling_factor".format(args.dir), "w") as f: f.write(str(args.frame_subsampling_factor)) diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index 8204ec63eaf..3c81ec12b95 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -233,9 +233,10 @@ def add_nnet_context_info(config_dir, nnet_edits=None, """Create the 'vars' file that specifies model_left_context, etc.""" common_lib.execute_command("nnet3-init {0} {1}/ref.config " - "{1}/ref.raw".format(existing_model if - existing_model is not None else "", - config_dir)) + "{1}/ref.raw".format( + existing_model if + existing_model is not None else "", + config_dir)) model = "{0}/ref.raw".format(config_dir) if nnet_edits is not None: model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits, @@ -268,9 +269,10 @@ def check_model_contexts(config_dir, nnet_edits=None, existing_model=None): if os.path.exists('{0}/{1}.config'.format(config_dir, file_name)): contexts[file_name] = {} common_lib.execute_command("nnet3-init {0} {1}/{2}.config " - "{1}/{2}.raw".format(existing_model if - existing_model is not None else '', - config_dir, file_name)) + "{1}/{2}.raw".format( + existing_model if + existing_model is not None else '', + config_dir, file_name)) model = "{0}/{1}.raw".format(config_dir, file_name) if nnet_edits is not None: model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits, From ac95720f9183db260b957f15efb1792d5d686a00 Mon Sep 17 00:00:00 2001 From: Pegita Date: Tue, 15 Aug 2017 19:50:04 -0400 Subject: [PATCH 040/174] fixed some issues. --- .../local/chain/tuning/run_tdnn_wsj_rm_1a.sh | 10 +++-- .../local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 35 ++++++++-------- .../local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 41 ++++++++++--------- .../steps/libs/nnet3/xconfig/basic_layers.py | 6 --- egs/wsj/s5/steps/nnet3/align_lats.sh | 39 ++++-------------- egs/wsj/s5/steps/nnet3/chain/train.py | 4 +- 6 files changed, 58 insertions(+), 77 deletions(-) diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh index 651f43093ae..fb2e3e4c351 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -31,9 +31,7 @@ xent_regularize=0.1 # configs for transfer learning srcdir=../../wsj/s5 # base dir for source dataset. -src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl # input dnn model for source data - # that is used in transfer learning. - +src_tdnn_affix=1d common_egs_dir= primary_lr_factor=0.25 # The learning-rate factor for transferred layers from source # model. @@ -54,8 +52,12 @@ If you want to use GPUs (and have them), go to src/, and configure and make on a where "nvcc" is installed. EOF fi +src_mdl=$srcdir/exp/chain/tdnn${src_tdnn_affix}_sp/final.mdl # input dnn model for source data + # that is used in transfer learning. +src_extractor_dir=$srcdir/exp/nnet3/extractor # source extractor dir used to extract + # ivector for target data. 
-required_files="$src_mdl $srcdir/exp/nnet3/extractor/final.mdl" +required_files="$src_mdl $src_extractor_dir/final.dubm $src_extractor_dir/final.mat $src_extractor_dir/final.ie" for f in $required_files; do if [ ! -f $f ]; then echo "$0: no such file $f" diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index de8e2729832..b39857d79f4 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -44,20 +44,7 @@ xent_regularize=0.1 # configs for transfer learning common_egs_dir= srcdir=../../wsj/s5 # base directory for source data -src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl # input dnn model for source data - # that is used in transfer learning. - -src_lang=$srcdir/data/lang # source lang directory used to generate source model. - # new new lang dir for transfer learning prepared - # using source phone set, lexicon in src_lang and - # target word list. - -src_gmm_dir=$srcdir/exp/tri4b # source gmm dir used to generate alignments - # for target data. - -src_tree_dir=$srcdir/exp/chain/tree_a_sp # chain tree-dir for src data; - # the alignment in target domain is - # converted using src-tree +src_tdnn_affix=1d primary_lr_factor=0.25 # learning-rate factor for all except last layer in transferring source model final_lr_factor=1.0 # learning-rate factor for final layer in transferring source model. nnet_affix=_online_wsj @@ -92,11 +79,27 @@ ali_dir=exp/tri4b${src_tree_dir:+_wsj}_ali lat_dir=exp/tri3b_lats${src_tree_dir:+_wsj} dir=exp/chain/tdnn_wsj_rm${tdnn_affix} -required_files="$src_mdl $srcdir/exp/nnet3/extractor/final.mdl $src_lang/phones.txt $srcdir/data/local/dict_nosp/lexicon.txt $src_gmm_dir/final.mdl $src_tree_dir/tree" +# src directories +src_extractor_dir=$srcdir/exp/nnet3/extractor +src_mdl=$srcdir/exp/chain/tdnn${src_tdnn_affix}_sp/final.mdl # input dnn model for source data + # that is used in transfer learning. + +src_lang=$srcdir/data/lang # source lang directory used to generate source model. + # new new lang dir for transfer learning prepared + # using source phone set, lexicon in src_lang and + # target word list. + +src_gmm_dir=$srcdir/exp/tri4b # source gmm dir used to generate alignments + # for target data. + +src_tree_dir=$srcdir/exp/chain/tree_a_sp # chain tree-dir for src data; + # the alignment in target domain is + # converted using src-tree +required_files="$src_mdl $src_extractor_dir/final.dubm $src_extractor_dir/final.mat $src_extractor_dir/final.ie $src_lang/phones.txt $srcdir/data/local/dict_nosp/lexicon.txt $src_gmm_dir/final.mdl $src_tree_dir/tree" for f in $required_files; do if [ ! -f $f ]; then - echo "$0: no such file $f" + echo "$0: no such file $f" && exit 1; fi done diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index dfc003fd9f0..3a2b04a401c 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -50,17 +50,7 @@ xent_regularize=0.1 # configs for transfer learning common_egs_dir= srcdir=../../wsj/s5 # base directory for source data -src_mdl=$srcdir/exp/chain/tdnn1d_sp/final.mdl # input dnn model for source data - # that is used in transfer learning. - -src_lang=$srcdir/data/lang # source lang directory used to generate source model. - # new new lang dir for transfer learning prepared - # using source phone set, lexicon in src_lang and - # target word list. 
- -src_tree_dir=$srcdir/exp/chain/tree_a_sp # chain tree-dir for source dataset; - # the alignment in target domain is - # converted using src-tree +src_tdnn_affix=1d primary_lr_factor=0.25 # learning-rate factor for all except last layer in transferred source model nnet_affix=_online_wsj @@ -95,11 +85,25 @@ ali_dir=exp/chain/chain_ali_wsj treedir=exp/chain/tri4_5n_tree_wsj lat_dir=exp/chain_lats${src_tree_dir:+_wsj} -required_files="$src_mdl $srcdir/exp/nnet3/extractor/final.md $src_lang/phones.txt $srcdir/data/local/dict_nosp/lexicon.txt $src_tree_dir/tree" +# src directories +src_extractor_dir=$srcdir/exp/nnet3/extractor +src_mdl=$srcdir/exp/chain/tdnn${src_tdnn_affix}_sp/final.mdl # input dnn model for source data + # that is used in transfer learning. + +src_lang=$srcdir/data/lang # source lang directory used to generate source model. + # new new lang dir for transfer learning prepared + # using source phone set, lexicon in src_lang and + # target word list. + +src_tree_dir=$srcdir/exp/chain/tree_a_sp # chain tree-dir for source dataset; + # the alignment in target domain is + # converted using src-tree + +required_files="$src_mdl $src_extractor_dir/final.dubm $src_extractor_dir/final.mat $src_extractor_dir/final.ie $src_lang/phones.txt $srcdir/data/local/dict_nosp/lexicon.txt $src_tree_dir/tree" for f in $required_files; do if [ ! -f $f ]; then - echo "$0: no such file $f" + echo "$0: no such file $f" && exit 1; fi done @@ -121,11 +125,10 @@ local/online/run_nnet2_common.sh --stage $stage \ src_mdl_dir=`dirname $src_mdl` if [ $stage -le 4 ]; then echo "$0: Generate alignment using source chain model." - scale_opts="--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" steps/nnet3/align.sh --nj 100 --cmd "$train_cmd" \ --online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ --extra-left-context-initial 0 --extra-right-context-final 0 \ - --scale-opts "$scale_opts" \ + --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1." \ --frames-per-chunk $frames_per_chunk \ data/train_hires $lang_src_tgt $src_mdl_dir $ali_dir || exit 1; fi @@ -133,13 +136,12 @@ fi if [ $stage -le 5 ]; then # Get the alignments as lattices (gives the chain training more freedom). # use the same num-jobs as the alignments - scale_opts="--transition-scale=1.0 --self-loop-scale=1.0" steps/nnet3/align_lats.sh --nj 100 --cmd "$train_cmd" \ --acoustic-scale 1.0 --extra-left-context-initial 0 --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ - --scale-opts "$scale_opts" \ + --scale-opts "--transition-scale=1.0 --self-loop-scale=1." 
\ --online-ivector-dir exp/nnet2${nnet_affix}/ivectors data/train_hires \ - $lang_src_tgt $ali_dir $lat_dir || exit 1; + $lang_src_tgt $src_mdl_dir $lat_dir || exit 1; rm $lat_dir/fsts.*.gz # save space fi @@ -166,9 +168,10 @@ if [ $stage -le 8 ]; then if [ $train_stage -lt -4 ]; then train_stage=-4 fi - # we used chain model from source to generate lats for target and the + # we use chain model from source to generate lats for target and the # tolerance used in chain egs generation using this lats should be 1 or 2 which is # (source_egs_tolerance/frame_subsampling_factor) + # source_egs_tolerance = 5 chain_opts=(--chain.alignment-subsampling-factor=1 --chain.left-tolerance=1 --chain.right-tolerance=1) steps/nnet3/chain/train.py --stage $train_stage ${chain_opts[@]} \ --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 4ad698085aa..fde10ba3d00 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -1092,33 +1092,27 @@ def __init__(self, first_token, key_to_value, prev_names=None): def set_default_configs(self): - self.config = { 'dim': -1} def check_configs(self): - if self.config['dim'] <= 0: raise RuntimeError("Dimension of auxiliary-layer '{0}'" "should be positive.".format(self.name)) def get_input_descriptor_names(self): - return [] # there is no 'input' field in self.config. def output_name(self, auxiliary_outputs=None): - # there are no auxiliary outputs as this layer will just pass the input assert auxiliary_outputs is None return self.name def output_dim(self, auxiliary_outputs=None): - # there are no auxiliary outputs as this layer will just pass the input assert auxiliary_outputs is None return self.config['dim'] def get_full_config(self): - # unlike other layers the auxiliary layers should not to be printed in # any '*.config' ans = [] diff --git a/egs/wsj/s5/steps/nnet3/align_lats.sh b/egs/wsj/s5/steps/nnet3/align_lats.sh index a5975989cf6..f931b826b7b 100755 --- a/egs/wsj/s5/steps/nnet3/align_lats.sh +++ b/egs/wsj/s5/steps/nnet3/align_lats.sh @@ -23,7 +23,6 @@ extra_right_context=0 extra_left_context_initial=-1 extra_right_context_final=-1 online_ivector_dir= -feat_type= # you can set this to force it to use delta features. graphs_scp= # End configuration options. @@ -71,25 +70,12 @@ cp $lang/phones.txt $dir || exit 1; ## Set up features. Note: these are different from the normal features ## because we have one rspecifier that has the features for the entire ## training set, not separate ones for each batch. 
-if [ -z "$feat_type" ]; then - if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi -fi -echo "$0: feature type is $feat_type" +echo "$0: feature type is raw" cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` cp $srcdir/cmvn_opts $dir 2>/dev/null -case $feat_type in - raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" - ;; - lda) - splice_opts=`cat $srcdir/splice_opts 2>/dev/null` - cp $srcdir/splice_opts $dir 2>/dev/null - cp $srcdir/final.mat $dir || exit 1; - feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" - ;; - *) echo "$0: invalid feature type $feat_type" && exit 1; -esac +feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" if [ ! -z "$transform_dir" ]; then echo "$0: using transforms from $transform_dir" @@ -97,24 +83,18 @@ if [ ! -z "$transform_dir" ]; then echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; nj_orig=$(cat $transform_dir/num_jobs) - if [ $feat_type == "raw" ]; then trans=raw_trans; - else trans=trans; fi - if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then - echo "$0: LDA transforms differ between $srcdir and $transform_dir" - exit 1; - fi - if [ ! -f $transform_dir/$trans.1 ]; then - echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)" + if [ ! -f $transform_dir/raw_trans.1 ]; then + echo "$0: expected $transform_dir/raw_trans.1 to exist (--transform-dir option)" exit 1; fi if [ $nj -ne $nj_orig ]; then # Copy the transforms into an archive with an index. - for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \ - copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" + for n in $(seq $nj_orig); do cat $transform_dir/raw_trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/raw_trans.ark,$dir/raw_trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/raw_trans.scp ark:- ark:- |" else # number of jobs matches with alignment dir. - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |" fi fi @@ -167,7 +147,7 @@ if [ $stage -le 1 ]; then # Warning: nnet3-latgen-faster doesn't support a retry-beam so you may get more # alignment errors (however, it does have a default min-active=200 so this # will tend to reduce alignment errors). - # --allow_partial=false makes sure we reach the end of the decoding graph. + # --allow_partial=false makes sure we reach the end of the decoding graph. # --word-determinize=false makes sure we retain the alternative pronunciations of # words (including alternatives regarding optional silences). # --lattice-beam=$beam keeps all the alternatives that were within the beam, @@ -187,4 +167,3 @@ if [ $stage -le 1 ]; then fi echo "$0: done aligning data." 
- diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 403969991f1..ec70a3f4408 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -323,8 +323,8 @@ def train(args, run_opts): shutil.copy('{0}/tree'.format(args.tree_dir), args.dir) chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts) - if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config") - and args.input_model is None: + if ((args.stage <= -4) and (os.path.exists("{0}/configs/init.config".format(args.dir))) + and (args.input_model is None)): logger.info("Initializing a basic network for estimating " "preconditioning matrix") common_lib.execute_command( From ed8b952c545858637b9208c69261af495de8052b Mon Sep 17 00:00:00 2001 From: Pegita Date: Thu, 17 Aug 2017 12:40:51 -0400 Subject: [PATCH 041/174] fixed some comments and removed some options. --- .../local/chain/tuning/run_tdnn_wsj_rm_1a.sh | 6 ++-- .../local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 24 +++++++-------- .../local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 30 +++++++++---------- 3 files changed, 29 insertions(+), 31 deletions(-) diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh index fb2e3e4c351..e9f2f32828c 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -19,9 +19,7 @@ dir=exp/chain/tdnn_wsj_rm_1a num_epochs=2 initial_effective_lrate=0.005 final_effective_lrate=0.0005 -leftmost_questions_truncate=-1 max_param_change=2.0 -final_layer_normalize_target=0.5 num_jobs_initial=2 num_jobs_final=4 minibatch_size=128 @@ -36,7 +34,7 @@ common_egs_dir= primary_lr_factor=0.25 # The learning-rate factor for transferred layers from source # model. dim=450 -nnet_affix=_online +nnet_affix=_online_wsj # End configuration section. echo "$0 $@" # Print the command line for logging @@ -76,7 +74,7 @@ local/online/run_nnet2_common.sh --stage $stage \ --ivector-dim 100 \ --nnet-affix "$nnet_affix" \ --mfcc-config $srcdir/conf/mfcc_hires.conf \ - --extractor $srcdir/exp/nnet3/extractor || exit 1; + --extractor $src_extractor_dir || exit 1; if [ $stage -le 4 ]; then # Get the alignments as lattices (gives the chain training more freedom). diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index b39857d79f4..d688e1c9e99 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -31,15 +31,12 @@ get_egs_stage=-10 num_epochs=2 initial_effective_lrate=0.005 final_effective_lrate=0.0005 -leftmost_questions_truncate=-1 max_param_change=2.0 -final_layer_normalize_target=0.5 num_jobs_initial=2 num_jobs_final=4 -minibatch_size=32 +minibatch_size=128 frames_per_eg=150 remove_egs=false -xent_regularize=0.1 # configs for transfer learning common_egs_dir= @@ -73,12 +70,6 @@ fi # run those things. -lang_src_tgt=data/lang_wsj_rm # This dir is prepared using phones.txt and lexicon from - # WSJ and wordlist and G.fst from RM. 
-ali_dir=exp/tri4b${src_tree_dir:+_wsj}_ali -lat_dir=exp/tri3b_lats${src_tree_dir:+_wsj} -dir=exp/chain/tdnn_wsj_rm${tdnn_affix} - # src directories src_extractor_dir=$srcdir/exp/nnet3/extractor src_mdl=$srcdir/exp/chain/tdnn${src_tdnn_affix}_sp/final.mdl # input dnn model for source data @@ -95,6 +86,15 @@ src_gmm_dir=$srcdir/exp/tri4b # source gmm dir used to generate alignments src_tree_dir=$srcdir/exp/chain/tree_a_sp # chain tree-dir for src data; # the alignment in target domain is # converted using src-tree + +# dirs for src-to-tgt transfer learning experiment +lang_src_tgt=data/lang_wsj_rm # This dir is prepared using phones.txt and lexicon from + # WSJ and wordlist and G.fst from RM. +ali_dir=exp/tri4b${src_tree_dir:+_wsj}_ali +lat_dir=exp/tri3b_lats${src_tree_dir:+_wsj} +dir=exp/chain/tdnn_wsj_rm${tdnn_affix} + + required_files="$src_mdl $src_extractor_dir/final.dubm $src_extractor_dir/final.mat $src_extractor_dir/final.ie $src_lang/phones.txt $srcdir/data/local/dict_nosp/lexicon.txt $src_gmm_dir/final.mdl $src_tree_dir/tree" for f in $required_files; do @@ -117,7 +117,7 @@ local/online/run_nnet2_common.sh --stage $stage \ --ivector-dim 100 \ --nnet-affix "$nnet_affix" \ --mfcc-config $srcdir/conf/mfcc_hires.conf \ - --extractor $srcdir/exp/nnet3/extractor || exit 1; + --extractor $src_extractor_dir || exit 1; if [ $stage -le 4 ]; then echo "$0: Generate alignment using source model." @@ -164,7 +164,7 @@ if [ $stage -le 8 ]; then --cmd "$decode_cmd" \ --trainer.input-model $dir/input.raw \ --feat.online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ - --chain.xent-regularize $xent_regularize \ + --chain.xent-regularize 0.1 \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize 0.1 \ --chain.leaky-hmm-coefficient 0.1 \ diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index 3a2b04a401c..d1df31b93ac 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -37,20 +37,20 @@ frames_per_chunk=150 num_epochs=2 initial_effective_lrate=0.005 final_effective_lrate=0.0005 -leftmost_questions_truncate=-1 max_param_change=2.0 -final_layer_normalize_target=0.5 num_jobs_initial=2 num_jobs_final=4 -minibatch_size=32 +minibatch_size=128 frames_per_eg=150 remove_egs=false xent_regularize=0.1 # configs for transfer learning common_egs_dir= -srcdir=../../wsj/s5 # base directory for source data -src_tdnn_affix=1d +srcdir=../../wsj/s5 # base directory for source data which is the base dir + # for source ivector extractor, scr chain model and + # lang dir. +src_tdnn_affix=1d # tdnn affix used for src chain model. primary_lr_factor=0.25 # learning-rate factor for all except last layer in transferred source model nnet_affix=_online_wsj @@ -78,13 +78,6 @@ fi # nnet3 setup, and you can skip them by setting "--stage 8" if you have already # run those things. 
-lang=data/lang_chain_5n_wsj -lang_src_tgt=data/lang_wsj_rm # This dir is prepared using phones.txt and lexicon from - # source(WSJ) and and wordlist and G.fst from target(RM) -ali_dir=exp/chain/chain_ali_wsj -treedir=exp/chain/tri4_5n_tree_wsj -lat_dir=exp/chain_lats${src_tree_dir:+_wsj} - # src directories src_extractor_dir=$srcdir/exp/nnet3/extractor src_mdl=$srcdir/exp/chain/tdnn${src_tdnn_affix}_sp/final.mdl # input dnn model for source data @@ -99,6 +92,13 @@ src_tree_dir=$srcdir/exp/chain/tree_a_sp # chain tree-dir for source dataset; # the alignment in target domain is # converted using src-tree +# dirs for src-to-tgt transfer experiment +lang=data/lang_chain_5n_wsj +lang_src_tgt=data/lang_wsj_rm # This dir is prepared using phones.txt and lexicon from + # source(WSJ) and and wordlist and G.fst from target(RM) +ali_dir=exp/chain/chain_ali_wsj +lat_dir=exp/chain_lats${src_tree_dir:+_wsj} + required_files="$src_mdl $src_extractor_dir/final.dubm $src_extractor_dir/final.mat $src_extractor_dir/final.ie $src_lang/phones.txt $srcdir/data/local/dict_nosp/lexicon.txt $src_tree_dir/tree" for f in $required_files; do @@ -121,14 +121,14 @@ local/online/run_nnet2_common.sh --stage $stage \ --ivector-dim 100 \ --nnet-affix "$nnet_affix" \ --mfcc-config $srcdir/conf/mfcc_hires.conf \ - --extractor $srcdir/exp/nnet3/extractor || exit 1; + --extractor $src_extractor_dir || exit 1; src_mdl_dir=`dirname $src_mdl` if [ $stage -le 4 ]; then echo "$0: Generate alignment using source chain model." steps/nnet3/align.sh --nj 100 --cmd "$train_cmd" \ --online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ --extra-left-context-initial 0 --extra-right-context-final 0 \ - --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1." \ + --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \ --frames-per-chunk $frames_per_chunk \ data/train_hires $lang_src_tgt $src_mdl_dir $ali_dir || exit 1; fi @@ -139,7 +139,7 @@ if [ $stage -le 5 ]; then steps/nnet3/align_lats.sh --nj 100 --cmd "$train_cmd" \ --acoustic-scale 1.0 --extra-left-context-initial 0 --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ - --scale-opts "--transition-scale=1.0 --self-loop-scale=1." 
\ + --scale-opts "--transition-scale=1.0 --self-loop-scale=1.0" \ --online-ivector-dir exp/nnet2${nnet_affix}/ivectors data/train_hires \ $lang_src_tgt $src_mdl_dir $lat_dir || exit 1; rm $lat_dir/fsts.*.gz # save space From b92a63a0d1edf787a1bf61c14c834c9eb91b548b Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 17 Aug 2017 18:32:12 -0400 Subject: [PATCH 042/174] semisup: Adding some extra script for semi-supervised recipes --- .../s5/local/fisher_create_test_lang.sh | 2 + .../s5/local/fisher_train_lms.sh | 2 + .../s5/local/nnet3/run_ivector_common.sh | 66 ++-- .../s5/local/nnet3/run_ivector_common_pca.sh | 13 +- .../semisup/chain/tuning/run_tdnn_11k.sh | 4 +- .../run_tdnn_11k_semisupervised_conf_a.sh | 164 ++++---- .../run_tdnn_11k_semisupervised_conf_b.sh | 358 +++++++++++++++++ .../run_tdnn_11k_semisupervised_conf_c.sh | 359 +++++++++++++++++ .../run_tdnn_11k_semisupervised_conf_d.sh | 374 ++++++++++++++++++ .../run_tdnn_11k_semisupervised_conf_e.sh | 374 ++++++++++++++++++ .../run_tdnn_11k_semisupervised_conf_f.sh | 374 ++++++++++++++++++ .../run_tdnn_11k_semisupervised_conf_g.sh | 374 ++++++++++++++++++ .../semisup/chain/tuning/run_tdnn_oracle.sh | 198 ++++++++++ .../s5/local/semisup/run_15k.sh | 10 +- .../chain/build_tree_multiple_sources.sh | 76 ++-- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 2 + egs/wsj/s5/steps/subset_ali_dir.sh | 9 + 17 files changed, 2604 insertions(+), 155 deletions(-) create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_b.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_c.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_d.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_f.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_g.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_oracle.sh diff --git a/egs/fisher_english/s5/local/fisher_create_test_lang.sh b/egs/fisher_english/s5/local/fisher_create_test_lang.sh index 1d7c4013b83..533a0949962 100755 --- a/egs/fisher_english/s5/local/fisher_create_test_lang.sh +++ b/egs/fisher_english/s5/local/fisher_create_test_lang.sh @@ -44,5 +44,7 @@ fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ fstisstochastic || echo "[log:] LG is not stochastic" +utils/build_const_arpa_lm.sh data/local/lm/4gram-mincount/lm_unpruned.gz \ + data/lang_test data/lang_test_fg echo "$0 succeeded" diff --git a/egs/fisher_english/s5/local/fisher_train_lms.sh b/egs/fisher_english/s5/local/fisher_train_lms.sh index 881d3ce9466..585680550f8 100755 --- a/egs/fisher_english/s5/local/fisher_train_lms.sh +++ b/egs/fisher_english/s5/local/fisher_train_lms.sh @@ -70,6 +70,8 @@ cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1] train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; +train_lm.sh --arpa --lmtype 4gram-mincount $dir || exit 1; + # Perplexity over 88307.000000 words (excluding 691.000000 OOVs) is 71.241332 # note: output is diff --git a/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh b/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh index 6505381b03f..b1285de008f 100755 --- 
a/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh +++ b/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh @@ -6,8 +6,9 @@ stage=1 generate_alignments=true # false if doing chain training speed_perturb=true train_set=train - lda_train_set=train_100k +extractor= # ivector-extractor. + # If provided, will be used instead of training a new one. nnet3_affix= gmm=tri2_ali # should also contain alignments for $lda_train_set @@ -94,37 +95,42 @@ for line in sys.stdin.readlines(): steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems done - - # Take the first 30k utterances (about 1/8th of the data) this will be used - # for the diagubm training - utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires - utils/data/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr fi -# ivector extractor training -if [ $stage -le 4 ]; then - # We need to build a small system just because we need the LDA+MLLT transform - # to train the diag-UBM on top of. We use --num-iters 13 because after we get - # the transform (12th iter is the last), any further training is pointless. - # this decision is based on fisher_english - steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ - --splice-opts "--left-context=3 --right-context=3" \ - 5500 90000 data/${lda_train_set}_hires \ - data/lang $gmm_dir exp/nnet3${nnet3_affix}/tri3a -fi +if [ -z "$extractor" ]; then + if [ $stage -le 3 ]; then + # Take the first 30k utterances (about 1/8th of the data) this will be used + # for the diagubm training + utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires + utils/data/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr + fi -if [ $stage -le 5 ]; then - # To train a diagonal UBM we don't need very much data, so use the smallest subset. - steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ - data/${train_set}_30k_nodup_hires 512 exp/nnet3${nnet3_affix}/tri3a exp/nnet3${nnet3_affix}/diag_ubm -fi + # ivector extractor training + if [ $stage -le 4 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We use --num-iters 13 because after we get + # the transform (12th iter is the last), any further training is pointless. + # this decision is based on fisher_english + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --splice-opts "--left-context=3 --right-context=3" \ + 5500 90000 data/${lda_train_set}_hires \ + data/lang $gmm_dir exp/nnet3${nnet3_affix}/tri3a + fi + + if [ $stage -le 5 ]; then + # To train a diagonal UBM we don't need very much data, so use the smallest subset. + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${train_set}_30k_nodup_hires 512 exp/nnet3${nnet3_affix}/tri3a exp/nnet3${nnet3_affix}/diag_ubm + fi -if [ $stage -le 6 ]; then - # iVector extractors can be sensitive to the amount of data, but this one has a - # fairly small dim (defaults to 100) so we don't use all of it, we use just the - # 100k subset (just under half the data). 
- steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ - data/${lda_train_set}_hires exp/nnet3${nnet3_affix}/diag_ubm exp/nnet3${nnet3_affix}/extractor || exit 1; + if [ $stage -le 6 ]; then + # iVector extractors can be sensitive to the amount of data, but this one has a + # fairly small dim (defaults to 100) so we don't use all of it, we use just the + # 100k subset (just under half the data). + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${lda_train_set}_hires exp/nnet3${nnet3_affix}/diag_ubm exp/nnet3${nnet3_affix}/extractor || exit 1; + fi + extractor=exp/nnet3${nnet3_affix}/extractor fi if [ $stage -le 7 ]; then @@ -136,11 +142,11 @@ if [ $stage -le 7 ]; then steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ - data/${train_set}_max2_hires exp/nnet3${nnet3_affix}/extractor exp/nnet3${nnet3_affix}/ivectors_${train_set}_hires || exit 1; + data/${train_set}_max2_hires $extractor `basename $extractor`/ivectors_${train_set}_hires || exit 1; for dataset in test dev; do steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ - data/${dataset}_hires exp/nnet3${nnet3_affix}/extractor exp/nnet3${nnet3_affix}/ivectors_${dataset}_hires || exit 1; + data/${dataset}_hires $extractor `basename $extractor`/ivectors_${dataset}_hires || exit 1; done fi diff --git a/egs/fisher_english/s5/local/nnet3/run_ivector_common_pca.sh b/egs/fisher_english/s5/local/nnet3/run_ivector_common_pca.sh index e98fd479244..e159781e9a1 100755 --- a/egs/fisher_english/s5/local/nnet3/run_ivector_common_pca.sh +++ b/egs/fisher_english/s5/local/nnet3/run_ivector_common_pca.sh @@ -5,9 +5,11 @@ set -e stage=1 speed_perturb=true train_set=train -ivector_train_set=train +ivector_train_set= # data set for training i-vector extractor. + # If not provided, train_set will be used. nnet3_affix= +exp=exp . ./path.sh . ./utils/parse_options.sh @@ -31,6 +33,9 @@ if [ "$speed_perturb" == "true" ]; then done fi train_set=${train_set}_sp + if ! [ -z "$ivector_train_set" ]; then + ivector_train_set=${ivector_train_set}_sp + fi fi if [ $stage -le 3 ]; then @@ -63,6 +68,10 @@ if [ $stage -le 3 ]; then done fi +if [ -z "$ivector_train_set" ]; then + ivector_train_set=$train_set +fi + # ivector extractor training if [ $stage -le 4 ]; then steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ @@ -92,7 +101,9 @@ if [ $stage -le 7 ]; then steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ data/${ivector_train_set}_max2_hires $exp/nnet3${nnet3_affix}/extractor $exp/nnet3${nnet3_affix}/ivectors_${ivector_train_set}_hires || exit 1; +fi +if [ $stage -le 8 ]; then for dataset in test dev; do steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ data/${dataset}_hires $exp/nnet3${nnet3_affix}/extractor $exp/nnet3${nnet3_affix}/ivectors_${dataset}_hires || exit 1; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k.sh index ecbddef1b28..81335e5ae5b 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -# Based on run_tdnn_7b.sh in the fisher swbd recipe +# This is fisher chain recipe for training a model on a subset of around 10 hours. 
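A possible invocation of the updated run_ivector_common.sh that exercises the new --extractor option added above: with --extractor set, the LDA+MLLT, diag-UBM and extractor-training stages are bypassed and i-vectors are extracted with the supplied model. The directory and data-set names below are illustrative only, not taken from the patch.

  local/nnet3/run_ivector_common.sh \
    --speed-perturb true --train-set train_sup11k \
    --nnet3-affix _semi11k_250k \
    --extractor exp/nnet3_semi11k_250k/extractor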
# configs for 'chain' stage=0 @@ -52,7 +52,7 @@ lang=data/lang_chain # nnet3 setup, and you can skip them by setting "--stage 8" if you have already # run those things. -local/nnet3/run_ivector_common_pca.sh --stage $stage \ +local/nnet3/run_ivector_common_pca.sh --stage $stage --exp $exp \ --speed-perturb true \ --train-set $train_set \ --ivector-train-set $ivector_train_set \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh index 7e81a4a985b..60f64dee299 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh @@ -36,6 +36,9 @@ sup_egs_dir= unsup_egs_dir= tree_affix= +extra_left_context=0 +extra_right_context=0 + xent_regularize=0.1 hidden_dim=725 minibatch_size=128 @@ -124,16 +127,17 @@ cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 sup_ali_dir=$exp/tri3 -treedir=exp/chain${nnet3_affix}/tree_${tree_affix} +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} if [ $stage -le 9 ]; then steps/subset_ali_dir.sh --cmd "$train_cmd" \ data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ $chaindir/best_path_${unsupervised_set}${decode_affix} + echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor steps/nnet3/chain/build_tree_multiple_sources.sh \ --frame-subsampling-factor $frame_subsampling_factor \ - --leftmost-questions-truncate -1 \ + --use-fmllr false \ --cmd "$train_cmd" 10000 data/lang_chain \ data/${supervised_set} $sup_ali_dir \ data/${unsupervised_set} \ @@ -141,30 +145,86 @@ if [ $stage -le 9 ]; then $treedir fi -exit 1 - -dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} -unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} if [ $stage -le 10 ]; then steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ $dir fi +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain 
input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + supervised_set=${supervised_set}_sp sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats - if [ -z "$sup_egs_dir" ]; then sup_egs_dir=$dir/egs_${supervised_set} - - left_context=`cat $chaindir/egs/info/left_context` - right_context=`cat $chaindir/egs/info/right_context` - left_context_initial=`cat $chaindir/egs/info/left_context_initial` - right_context_final=`cat $chaindir/egs/info/right_context_final` frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) - if [ $stage -le 11 ]; then + if [ $stage -le 12 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $sup_egs_dir/storage ]; then utils/create_split_dir.pl \ /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage @@ -180,25 +240,23 @@ if [ -z "$sup_egs_dir" ]; then --frames-per-eg $frames_per_eg \ --frames-per-iter 1500000 \ --cmvn-opts "$cmvn_opts" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ --generate-egs-scp true \ data/${supervised_set}_hires $dir \ $sup_lat_dir $sup_egs_dir fi else - left_context=`cat $sup_egs_dir/info/left_context` - right_context=`cat $sup_egs_dir/info/right_context` - left_context_initial=`cat $sup_egs_dir/info/left_context_initial` - right_context_final=`cat $sup_egs_dir/info/right_context_final` frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) fi +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg if [ -z "$unsup_egs_dir" ]; then unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} - if [ $stage -le 12 ]; then + if [ $stage -le 13 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then utils/create_split_dir.pl \ /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage @@ -216,16 +274,16 @@ if [ -z "$unsup_egs_dir" ]; then --lattice-prune-beam "$lattice_prune_beam" \ --phone-insertion-penalty "$phone_insertion_penalty" \ --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ --generate-egs-scp true \ - data/${unsupervised_set}_hires $chaindir \ + data/${unsupervised_set}_hires $dir \ $unsup_lat_dir $unsup_egs_dir fi fi comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi -if [ $stage -le 13 ]; then +if [ $stage -le 14 ]; then steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ --minibatch-size 128 --frames-per-iter 1500000 \ --lang2weight $supervision_weights --egs-prefix cegs. 2 \ @@ -233,68 +291,16 @@ if [ $stage -le 13 ]; then touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
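The two comma-separated weight lists used in these semi-supervised scripts pair up positionally with the two data sources; the condensed view below uses the default values declared in the _b/_c/_d variants further down, and the interpretation is inferred from how the options are passed here, so treat it as a reading aid rather than a definitive spec. In the combine_egs.sh call above, the positional "2" is the number of input egs directories.

  # combine_egs.sh: --lang2weight gives one scale per input egs dir, in order
  supervision_weights=1.0,0.3   # 1.0 -> $sup_egs_dir (supervised), 0.3 -> $unsup_egs_dir (unsupervised)
  # make_den_fst.sh: --weights gives one weight per alignment source, in order
  lm_weights=5,2                # 5 -> $treedir (supervised alignments), 2 -> best-path alignments of unsupervised data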
fi -if [ $stage -le 13 ]; then - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 dim=$hidden_dim - relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim - relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim - relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim - relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim - relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim - - ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 - output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 - output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 - output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. 
- relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 - output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 -EOF - - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ - cp $dir/configs/final.config{,.orig} - - cat $dir/configs/final.config.orig | \ - perl -pe 's/component=output-1.affine/component=output-0.affine/g; - s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ - $dir/configs/final.config -fi if [ $train_stage -le -4 ]; then train_stage=-4 fi -if [ $stage -le 14 ]; then +if [ $stage -le 15 ]; then steps/nnet3/chain/train.py --stage $train_stage \ --egs.dir "$comb_egs_dir" \ --cmd "$decode_cmd" \ - --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient 0.1 \ @@ -319,14 +325,14 @@ if [ $stage -le 14 ]; then fi graph_dir=$dir/graph -if [ $stage -le 15 ]; then +if [ $stage -le 17 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir fi -if [ $stage -le 16 ]; then +if [ $stage -le 18 ]; then iter_opts= if [ ! -z $decode_iter ]; then nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ @@ -343,7 +349,7 @@ if [ $stage -le 16 ]; then num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; ) & done diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_b.sh new file mode 100644 index 00000000000..f106549167f --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_b.sh @@ -0,0 +1,358 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. 
+ +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_c.sh new file mode 100644 index 00000000000..60f64dee299 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_c.sh @@ -0,0 +1,359 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_d.sh new file mode 100644 index 00000000000..780c783c87f --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_d.sh @@ -0,0 +1,374 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1d # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
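+  # A quick worked example (illustrative, assuming the default
+  # xent_regularize=0.1 set at the top of this script):
+  #   learning_rate_factor = 0.5 / 0.1 = 5.0
+  # and since the xent objective is itself scaled by xent_regularize = 0.1,
+  # the product is 0.5 whatever value xent_regularize takes.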
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
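+    # The get_egs.sh options below control how the lattice supervision for the
+    # unsupervised egs is built; a summary of the intent, following the
+    # comments at the top of this script:
+    #  - --alignment-subsampling-factor 1: the lattices come from decoding
+    #    with the chain model, so they are already at the reduced output
+    #    frame rate (the supervised egs use a factor of 3 instead).
+    #  - --lattice-lm-scale $lattice_lm_scale (0.0 here): how much of the
+    #    lattice graph/LM score is kept in the supervision weights.
+    #  - --lattice-prune-beam $lattice_prune_beam (2.0 here): prunes the
+    #    lattices before they are converted into supervision FSTs.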
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh new file mode 100644 index 00000000000..9f2a2a8993b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh @@ -0,0 +1,374 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1e # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
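+    # Note on --deriv-weights-scp below: weights.scp comes from the best-path
+    # decode of the unsupervised data and holds a per-frame weight (roughly, a
+    # confidence) for each utterance.  The weights are stored in the egs, and
+    # because train.py is later called with --chain.apply-deriv-weights true,
+    # they scale the derivatives computed on the unsupervised frames.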
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_f.sh new file mode 100644 index 00000000000..346c5e6eede --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_f.sh @@ -0,0 +1,374 @@ +#!/bin/bash + +# This script is same as _e, but is run for 3 epochs instead of 4. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1f # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
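+    # Worked example of the context padding computed above (illustrative
+    # numbers: frame_subsampling_factor=3 and a hypothetical
+    # model_left_context=16 with extra_left_context=0):
+    #   left_context = 16 + 0 = 16
+    #   int(16 + 3/2) = int(17.5) = 17
+    # i.e. each context is widened by roughly half a subsampling interval,
+    # giving the egs a little slack in input frames after frame subsampling.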
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then
+    nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \
+      nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1
+    iter_opts=" --iter ${decode_iter}-output "
+  else
+    nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \
+      nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1
+    iter_opts=" --iter final-output "
+  fi
+
+  for decode_set in dev test; do
+    (
+      num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l`
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+        --nj $num_jobs --cmd "$decode_cmd" $iter_opts \
+        --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \
+        $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1;
+    ) &
+  done
+fi
+wait;
+exit 0;
+
diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_g.sh
new file mode 100644
index 00000000000..ccca9c6d334
--- /dev/null
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_g.sh
@@ -0,0 +1,374 @@
+#!/bin/bash
+
+# This script is the same as _f, but uses 300-frame egs for the unsupervised
+# data (unsup_frames_per_eg=300).
+
+set -u -e -o pipefail
+
+stage=-2
+train_stage=-100
+nj=40
+decode_nj=40
+base_train_set=semisup11k_250k  # for reference
+exp=exp/semisup_11k
+
+unsupervised_set=train_unsup250k  # set this to your choice of unsupervised data
+supervised_set=train_sup11k
+semi_affix=semi11k_250k  # affix relating to the train-set splitting proportion
+
+tdnn_affix=7b  # affix for the supervised chain-model directory
+train_supervised_opts="--stage -10 --train-stage -10"
+
+# Unsupervised options
+decode_affix=
+egs_affix=  # affix for the egs that are generated from unsupervised data and for the combined egs dir
+unsup_frames_per_eg=300  # if empty, will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly
+lattice_lm_scale=0.0  # lm-scale for using the weights from unsupervised lattices
+lattice_prune_beam=2.0  # If supplied, the lattices will be pruned to this beam prior to getting egs for unsupervised data
+tolerance=2
+graph_affix=_ex250k  # can be used to decode the unsup data with another lm/graph
+phone_insertion_penalty=
+
+# Semi-supervised options
+comb_affix=comb1g  # affix for the new chain-model directory trained on the combined supervised+unsupervised subsets
+supervision_weights=1.0,1.0
+lm_weights=5,2
+sup_egs_dir=
+unsup_egs_dir=
+tree_affix=fg
+
+extra_left_context=0
+extra_right_context=0
+
+xent_regularize=0.1
+hidden_dim=725
+minibatch_size=128
+# to tune:
+# frames_per_eg for unsupervised
+
+decode_iter=
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+nnet3_affix=_${semi_affix}  # affix for nnet3 and chain dirs
+decode_affix=${decode_affix}${graph_affix}
+egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}
+tree_affix=${tree_affix}_${semi_affix}
+
+RANDOM=0
+
+if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
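+    # This variant sets unsup_frames_per_eg=300, while the supervised egs keep
+    # the supervised model's frames_per_eg (assumed to be 150, matching the
+    # --egs.chunk-width 150 used below).  The minibatch spec "150=128/300=64"
+    # passed to train.py then means 128 chunks per minibatch for 150-frame egs
+    # and 64 chunks for 300-frame egs, keeping the number of frames per
+    # minibatch roughly constant: 150*128 = 300*64 = 19200.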
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_oracle.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_oracle.sh new file mode 100755 index 00000000000..aa0e433c526 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_oracle.sh @@ -0,0 +1,198 @@ +#!/bin/bash +set -e + +# Based on run_tdnn_7b.sh in the fisher swbd recipe + +# configs for 'chain' +stage=0 +tdnn_affix=7b_oracle +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup11k +ivector_train_set=semisup11k_250k +tree_affix= +nnet3_affix=_semi11k_250k +chain_affix=_semi11k_250k +exp=exp/semisup_11k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +ali_dir=${gmm_dir}_ali_${train_set} +if [ $stage -le 11 ]; then + steps/align_fmllr.sh --cmd "$train_cmd" --nj 40 \ + data/${train_set} data/lang $gmm_dir $ali_dir || exit 1 + + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 11000 data/${train_set} $lang $ali_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
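+  # Unlike the semi-supervised runs above, this oracle setup trains a single
+  # 'output' layer on fully transcribed data, so there are no per-frame
+  # deriv weights to apply; hence --chain.apply-deriv-weights false in the
+  # train.py call below.  The remaining chain options mirror the supervised
+  # baseline.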
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/run_15k.sh b/egs/fisher_english/s5/local/semisup/run_15k.sh index 2be45d954d6..7d5a2589a21 100644 --- a/egs/fisher_english/s5/local/semisup/run_15k.sh +++ b/egs/fisher_english/s5/local/semisup/run_15k.sh @@ -55,7 +55,6 @@ steps/train_sat.sh --cmd "$train_cmd" \ )& utils/combine_data.sh data/semisup15k_250k data/train_sup15k data/train_unsup250k || exit 1 -} local/semisup/chain/tuning/run_tdnn_11k.sh \ --train-set train_sup15k \ @@ -64,3 +63,12 @@ local/semisup/chain/tuning/run_tdnn_11k.sh \ --stage $stage --train-stage $train_stage \ --exp $exp \ --ivector-train-set semisup15k_250k || exit 1 +} + +local/semisup/chain/tuning/run_tdnn_oracle.sh \ + --train-set semisup15k_250k \ + --nnet3-affix _semi15k_250k \ + --chain-affix _semi15k_250k_oracle \ + --stage 9 --train-stage $train_stage \ + --exp $exp \ + --ivector-train-set semisup15k_250k || exit 1 diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh index 79c1f654ee4..6892a2ff1ee 100755 --- a/egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh @@ -1,16 +1,10 @@ #!/bin/bash # Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2017 Vimal Manohar # Apache 2.0. - -# This script builds a tree for use in the 'chain' systems (although the script -# itself is pretty generic and doesn't use any 'chain' binaries). 
This is just -# like the first stages of a standard system, like 'train_sat.sh', except it -# does 'convert-ali' to convert alignments to a monophone topology just created -# from the 'lang' directory (in case the topology is different from where you -# got the system's alignments from), and it stops after the tree-building and -# model-initialization stage, without re-estimating the Gaussians or training -# the transitions. +# This script is similar to steps/nnet3/chain/build_tree.sh but supports +# getting statistics from multiple alignment sources. # Begin configuration section. @@ -19,13 +13,11 @@ exit_stage=-100 # you can use this to require it to exit at the # beginning of a specific stage. Not all values are # supported. cmd=run.pl -use_fmllr=true +use_fmllr=true # If true, fmllr transforms will be applied from the alignment directories. + # Otherwise, no fmllr will be applied even if alignment directory contains trans.* context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone. cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves -frame_subsampling_factor=1 -leftmost_questions_truncate=-1 # note: this used to default to 10, but we never - # use this option now with value != -1, and - # we're changing the default +frame_subsampling_factor=1 # frame subsampling factor of output w.r.t. to the input features tree_stats_opts= cluster_phones_opts= repeat_frames=false @@ -38,7 +30,7 @@ echo "$0 $@" # Print the command line for logging if [ $# -lt 5 ]; then echo "Usage: steps/nnet3/chain/build_tree_multiple_sources.sh <#leaves> [ ... ] " - echo " e.g.: steps/nnet3/chain/build_tree_multiple_sources.sh 15000 data/train_semi data/lang data/train_sup:exp/tri3_ali data/train_unsup:exp/tri3/best_path_train_unsup exp/tree_semi" + echo " e.g.: steps/nnet3/chain/build_tree_multiple_sources.sh 15000 data/lang data/train_sup exp/tri3_ali data/train_unsup exp/tri3/best_path_train_unsup exp/tree_semi" echo "Main options (for others, see top of script file)" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." echo " --config # config containing options" @@ -68,13 +60,13 @@ if (( $num_sys % 2 != 0 )); then exit 1 fi -num_sys=$((num_sys % 2)) +num_sys=$((num_sys / 2)) data=$dir/data_tmp mkdir -p $data mkdir -p $dir -alidir=`echo ${data_and_alidirs[0]} | cut -d: -s -f2` +alidir=`echo ${data_and_alidirs[1]}` datadirs=() alidirs=() @@ -115,14 +107,17 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" feats=() +feats_one=() for n in `seq 0 $[num_sys-1]`; do this_nj=$(cat ${alidirs[$n]}/num_jobs) || exit 1 this_sdata=${datadirs[$n]}/split$this_nj [[ -d $this_sdata && ${datadirs[$n]}/feats.scp -ot $this_sdata ]] || split_data.sh ${datadirs[$n]} $this_nj || exit 1; ## Set up speaker-independent features. 
case $feat_type in - delta) feats[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/JOB/utt2spk scp:$this_sdata/JOB/cmvn.scp scp:$this_sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; + delta) feats[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/JOB/utt2spk scp:$this_sdata/JOB/cmvn.scp scp:$this_sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |" + feats_one[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/1/utt2spk scp:$this_sdata/1/cmvn.scp scp:$this_sdata/1/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/JOB/utt2spk scp:$this_sdata/JOB/cmvn.scp scp:$this_sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" + feats_one[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/1/utt2spk scp:$this_sdata/1/cmvn.scp scp:$this_sdata/1/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir cp $alidir/full.mat $dir 2>/dev/null ;; @@ -130,18 +125,20 @@ for n in `seq 0 $[num_sys-1]`; do esac if $use_fmllr; then - if [ ! -f $this_alidir/trans.1 ]; then - echo "$0: Could not find fMLLR transforms in $this_alidir" + if [ ! -f ${alidirs[$n]}/trans.1 ]; then + echo "$0: Could not find fMLLR transforms in ${alidirs[$n]}" exit 1 fi - echo "$0: Using transforms from $this_alidir" - feats[i]="${feats[i]} transform-feats --utt2spk=ark:$this_sdata/JOB/utt2spk ark,s,cs:$this_alidir/trans.JOB ark:- ark:- |" + echo "$0: Using transforms from ${alidirs[$n]}" + feats[i]="${feats[i]} transform-feats --utt2spk=ark:$this_sdata/JOB/utt2spk ark,s,cs:${alidirs[$n]}/trans.JOB ark:- ark:- |" + feats_one[i]="${feats_one[i]} transform-feats --utt2spk=ark:$this_sdata/1/utt2spk ark,s,cs:${alidirs[$n]}/trans.1 ark:- ark:- |" fi # Do subsampling of feats, if needed if [ $frame_subsampling_factor -gt 1 ]; then feats[$n]="${feats[$n]} subsample-feats --n=$frame_subsampling_factor ark:- ark:- |" + feats_one[$n]="${feats_one[$n]} subsample-feats --n=$frame_subsampling_factor ark:- ark:- |" fi done @@ -159,11 +156,13 @@ if [ $stage -le -5 ]; then fi for n in `seq 0 $[num_sys-1]`; do - copy-feats ${feats[$n]} ark:- - done | \ + copy-feats "${feats_one[$n]}" ark:- + done | copy-feats ark:- ark:$dir/tmp.ark + + $cmd $dir/log/init_mono.log \ gmm-init-mono $shared_phones_opt \ - "--train-feats=subset-feats --n=10 ark:- ark:-|" $lang/topo $feat_dim \ - $dir/mono.mdl $dir/mono.tree 2> $dir/log/init_mono.log || exit 1; + "--train-feats=ark:subset-feats --n=10 ark:$dir/tmp.ark ark:- |" $lang/topo $feat_dim \ + $dir/mono.mdl $dir/mono.tree || exit 1 fi @@ -208,18 +207,9 @@ if [ $stage -le -3 ] && $train_tree; then $lang/phones/sets.int $dir/questions.int || exit 1; cat $lang/phones/extra_questions.int >> $dir/questions.int $cmd $dir/log/compile_questions.log \ - compile-questions --leftmost-questions-truncate=$leftmost_questions_truncate \ + compile-questions \ $context_opts $lang/topo $dir/questions.int $dir/questions.qst || exit 1; - # questions_truncated.int will be needed later on when we build the phone - # language model for 'chain' training. It's a mechanism of keeping the graph - # small. 
- if [ $leftmost_questions_truncate -gt 0 ]; then - head -n $leftmost_questions_truncate $dir/questions.int > $dir/questions_truncated.int - else - cp $dir/questions.int $dir/questions_truncated.int - fi - echo "$0: Building the tree" $cmd $dir/log/build_tree.log \ build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ @@ -255,22 +245,24 @@ if [ $stage -le -1 ]; then exit 1 fi + echo "$0: frame-subsampling-factor for $this_alidir is $this_frame_subsampling_factor" + this_frame_subsampling_factor=$((frame_subsampling_factor / this_frame_subsampling_factor)) - echo "$0: Converting alignments from $alidir to use current tree" + echo "$0: Converting alignments from $this_alidir to use current tree" $cmd JOB=1:$this_nj $dir/log/convert.$n.JOB.log \ convert-ali --repeat-frames=$repeat_frames \ --frame-subsampling-factor=$this_frame_subsampling_factor \ - $alidir/final.mdl $dir/1.mdl $dir/tree \ - ark,scp:$dir/ali.$n.JOB.ark,$dir/ali.$n.JOB.scp + $this_alidir/final.mdl $dir/1.mdl $dir/tree "ark:gunzip -c $this_alidir/ali.JOB.gz |" \ + ark,scp:$dir/ali.$n.JOB.ark,$dir/ali.$n.JOB.scp || exit 1 for i in `seq $this_nj`; do - cat $dir/ali.$n.$i.scp - done > $dir/ali.$n.scp + cat $dir/ali.$n.$i.scp + done > $dir/ali.$n.scp || exit 1 done for n in `seq 0 $[num_sys-1]`; do cat $dir/ali.$n.scp - done | sort -k1,1 > $dir/ali.scp + done | sort -k1,1 > $dir/ali.scp || exit 1 utils/split_data.sh $data $nj $cmd JOB=1:$nj $dir/log/copy_alignments.JOB.log \ diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index f478d3a811a..de3310f8f4b 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -223,6 +223,8 @@ if [ -f $dir/trans.scp ]; then train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" fi +tree-info $chaindir/tree | grep num-pdfs | awk '{print $2}' > $dir/info/num_pdfs || exit 1 + if [ ! -z "$online_ivector_dir" ]; then ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim > $dir/info/ivector_dim diff --git a/egs/wsj/s5/steps/subset_ali_dir.sh b/egs/wsj/s5/steps/subset_ali_dir.sh index a17a7fbf196..c086ea39959 100755 --- a/egs/wsj/s5/steps/subset_ali_dir.sh +++ b/egs/wsj/s5/steps/subset_ali_dir.sh @@ -1,5 +1,8 @@ #!/bin/bash +# Copyright 2017 Vimal Manohar +# Apache 2.0. + cmd=run.pl . path.sh @@ -24,6 +27,10 @@ dir=$4 nj=$(cat $ali_dir/num_jobs) || exit 1 utils/split_data.sh $data $nj +mkdir -p $dir +cp $ali_dir/{final.mdl,*.mat,*_opts,tree} $dir/ || true +cp -r $ali_dir/phones $dir 2>/dev/null || true + $cmd JOB=1:$nj $dir/log/copy_alignments.JOB.log \ copy-int-vector "ark:gunzip -c $ali_dir/ali.JOB.gz |" \ ark,scp:$dir/ali_tmp.JOB.ark,$dir/ali_tmp.JOB.scp || exit 1 @@ -43,6 +50,8 @@ $cmd JOB=1:$nj $dir/log/filter_alignments.JOB.log \ "scp:utils/filter_scp.pl $subset_data/split${nj}/JOB/utt2spk $dir/ali_tmp.scp |" \ "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 +echo $nj > $dir/num_jobs + rm $dir/ali_tmp.*.{ark,scp} $dir/ali_tmp.scp exit 0 From 7a9ef54975ad22146cb1811c974913bcf8d64f0b Mon Sep 17 00:00:00 2001 From: Pegita Date: Thu, 17 Aug 2017 19:48:07 -0400 Subject: [PATCH 043/174] fixed src dirs options for transfer learning scripts 1{a,b,c} and modified scripts to check the need for ivector w.r.t source model. 
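The i-vector check added to scripts 1{a,b,c} boils down to reading the ivector-dim of the source model and requiring an extractor dir only when that dimension is nonzero. A condensed sketch of the logic (not the exact script text; the model path is only an example, and $src_ivec_extractor_dir is the same variable the scripts use):

  # Does the source chain model expect i-vector input?
  src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl   # example source model
  ivector_dim=$(nnet3-am-info --print-args=false $src_mdl | grep "ivector-dim" | cut -d" " -f2)
  [ -z "$ivector_dim" ] && ivector_dim=0
  use_ivector=false
  if [ $ivector_dim -gt 0 ]; then
    # source model was trained with i-vectors, so an extractor must be supplied
    [ -z "$src_ivec_extractor_dir" ] && \
      echo "$0: source model uses i-vectors but no extractor dir was given" && exit 1
    use_ivector=true
  fi

When use_ivector ends up false, the i-vector extraction stages and the --feat.online-ivector-dir option are skipped downstream.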
--- .../local/chain/tuning/run_tdnn_wsj_rm_1a.sh | 101 +++++++----- .../local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 133 ++++++++------- .../local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 152 ++++++++++-------- egs/rm/s5/local/online/run_nnet2_common.sh | 6 +- 4 files changed, 224 insertions(+), 168 deletions(-) diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh index e9f2f32828c..b15d904eb20 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -15,25 +15,23 @@ train_stage=-10 get_egs_stage=-10 dir=exp/chain/tdnn_wsj_rm_1a -# training options -num_epochs=2 -initial_effective_lrate=0.005 -final_effective_lrate=0.0005 -max_param_change=2.0 -num_jobs_initial=2 -num_jobs_final=4 -minibatch_size=128 -frames_per_eg=150 -remove_egs=false -xent_regularize=0.1 - # configs for transfer learning -srcdir=../../wsj/s5 # base dir for source dataset. -src_tdnn_affix=1d +src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl # input chain model + # trained on source dataset (wsj). + # This model is transfered to the target domain. + +src_mfcc_config=../../wsj/s5/conf/mfcc_hires.conf # mfcc config used to extract higher dim + # mfcc features used for ivector training + # in source domain. +src_ivec_extractor_dir= # source ivector extractor dir used to extract ivector for + # source data and the ivector for target data is extracted using this extractor. + # It should be nonempty, if ivector is used in source model training. + common_egs_dir= primary_lr_factor=0.25 # The learning-rate factor for transferred layers from source - # model. -dim=450 + # model. e.g. if 0, it fixed the paramters transferred from source. + # The learning-rate factor for new added layers is 1.0. + nnet_affix=_online_wsj # End configuration section. @@ -50,12 +48,24 @@ If you want to use GPUs (and have them), go to src/, and configure and make on a where "nvcc" is installed. EOF fi -src_mdl=$srcdir/exp/chain/tdnn${src_tdnn_affix}_sp/final.mdl # input dnn model for source data - # that is used in transfer learning. -src_extractor_dir=$srcdir/exp/nnet3/extractor # source extractor dir used to extract - # ivector for target data. +required_files="$src_mfcc_config $src_mdl" +use_ivector=false +ivector_dim=$(nnet3-am-info --print-args=false $src_mdl | grep "ivector-dim" | cut -d" " -f2) +if [ "$ivector_dim" == "" ]; then ivector_dim=0 ; fi + +if [ ! -z $src_ivec_extractor_dir ]; then + if [ $ivector_dim -eq 0 ]; then + echo "source ivector extractor dir '$src_ivec_extractor_dir' is specified but ivector is not used in training the source model '$src_mdl'." + else + required_files="$required_files $src_ivec_extractor_dir/final.dubm $src_ivec_extractor_dir/final.mat $src_ivec_extractor_dir/final.ie" + use_ivector=true + fi +else + if [ $ivector_dim -gt 0 ]; then + echo "ivector is used in training the source model '$src_mdl' but no ivector extractor dir for source model specified." && exit 1; + fi +fi -required_files="$src_mdl $src_extractor_dir/final.dubm $src_extractor_dir/final.mat $src_extractor_dir/final.ie" for f in $required_files; do if [ ! -f $f ]; then echo "$0: no such file $f" @@ -63,7 +73,7 @@ for f in $required_files; do done # The iVector-extraction and feature-dumping parts are the same as the standard -# nnet3 setup, and you can skip them by setting "--stage 8" if you have already +# nnet3 setup, and you can skip them by setting "--stage 4" if you have already # run those things. 
ali_dir=exp/tri3b_ali @@ -71,10 +81,10 @@ treedir=exp/chain/tri4_5n_tree lang=data/lang_chain_5n local/online/run_nnet2_common.sh --stage $stage \ - --ivector-dim 100 \ + --ivector-dim $ivector_dim \ --nnet-affix "$nnet_affix" \ - --mfcc-config $srcdir/conf/mfcc_hires.conf \ - --extractor $src_extractor_dir || exit 1; + --mfcc-config $src_mfcc_config \ + --extractor $src_ivec_extractor_dir || exit 1; if [ $stage -le 4 ]; then # Get the alignments as lattices (gives the chain training more freedom). @@ -101,7 +111,7 @@ fi if [ $stage -le 6 ]; then # Build a tree using our new topology. steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --leftmost-questions-truncate $leftmost_questions_truncate \ + --leftmost-questions-truncate -1 \ --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir fi @@ -113,11 +123,11 @@ if [ $stage -le 7 ]; then mkdir -p $dir mkdir -p $dir/configs cat < $dir/configs/network.xconfig - relu-renorm-layer name=tdnn7-target input=Append(tdnn6.renorm@-3,tdnn6.renorm@0) dim=$dim + relu-renorm-layer name=tdnn7-target input=Append(tdnn6.renorm@-3,tdnn6.renorm@0) dim=450 ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain input=tdnn7-target dim=$dim target-rms=0.5 + relu-renorm-layer name=prefinal-chain input=tdnn7-target dim=450 target-rms=0.5 output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - relu-renorm-layer name=prefinal-xent input=tdnn7-target dim=$dim target-rms=0.5 + relu-renorm-layer name=prefinal-xent input=tdnn7-target dim=450 target-rms=0.5 output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \ @@ -137,11 +147,13 @@ if [ $stage -le 8 ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi + ivector_dir= + if $use_ivector; then ivector_dir="exp/nnet2${nnet_affix}/ivectors" ; fi steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ --trainer.input-model $dir/input.raw \ - --feat.online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ + --feat.online-ivector-dir "$ivector_dir" \ --chain.xent-regularize $xent_regularize \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize 0.1 \ @@ -151,36 +163,39 @@ if [ $stage -le 8 ]; then --chain.lm-opts="--num-extra-lm-states=200" \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width $frames_per_eg \ - --trainer.num-chunk-per-minibatch=$minibatch_size \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch=128 \ --trainer.frames-per-iter 1000000 \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.num-jobs-initial=$num_jobs_initial \ - --trainer.optimization.num-jobs-final=$num_jobs_final \ - --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ - --trainer.optimization.final-effective-lrate=$final_effective_lrate \ - --trainer.max-param-change $max_param_change \ - --cleanup.remove-egs false \ + --trainer.num-epochs 2 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.005 \ + --trainer.optimization.final-effective-lrate=0.0005 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ --feat-dir data/train_hires \ --tree-dir $treedir \ --lat-dir exp/tri3b_lats \ --dir $dir || exit 1; fi -if [ $stage -le 9 ]; then +if [ $stage -le 9 ] 
&& $use_ivector; then steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ - data/test $srcdir/exp/nnet3/extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; + data/test $src_ivec_extractor_dir exp/nnet2${nnet_affix}/ivectors_test || exit 1; fi if [ $stage -le 10 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. + ivec_opt="" + if $use_ivector;then + ivec_opt="--online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test" + fi utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --scoring-opts "--min-lmwt 1" \ - --nj 20 --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ + --nj 20 --cmd "$decode_cmd" $ivec_opt \ $dir/graph data/test_hires $dir/decode || exit 1; fi wait; diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index d688e1c9e99..2e848b7bd94 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -26,29 +26,47 @@ set -e stage=7 train_stage=-4 get_egs_stage=-10 - -# training options -num_epochs=2 -initial_effective_lrate=0.005 -final_effective_lrate=0.0005 -max_param_change=2.0 -num_jobs_initial=2 -num_jobs_final=4 -minibatch_size=128 -frames_per_eg=150 -remove_egs=false +tdnn_affix=_1b # configs for transfer learning common_egs_dir= -srcdir=../../wsj/s5 # base directory for source data -src_tdnn_affix=1d -primary_lr_factor=0.25 # learning-rate factor for all except last layer in transferring source model -final_lr_factor=1.0 # learning-rate factor for final layer in transferring source model. +primary_lr_factor=0.25 # The learning-rate factor for transferred layers from source + # model. e.g. if 0, it fixed the paramters transferred from source. + # The learning-rate factor for new added layers is 1.0. nnet_affix=_online_wsj phone_lm_scales="1,10" # comma-separated list of integer valued scale weights # to scale different phone sequences for different alignments # e.g. (src-weight,target-weight)=(1,10) -tdnn_affix=_1b + +# model and dirs for source model used for transfer learning +src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl # input chain model + # trained on source dataset (wsj). + # This model is transfered to the target domain. + +src_mfcc_config=../../wsj/s5/conf/mfcc_hires.conf # mfcc config used to extract higher dim + # mfcc features used for ivector training + # in source domain. +src_ivec_extractor_dir= # source ivector extractor dir used to extract ivector for + # source data and the ivector for target data is extracted using this extractor. + # It should be nonempty, if ivector is used in source model training. + +src_lang=../../wsj/s5/data/lang # source lang directory used to train source model. + # new new lang dir for transfer learning experiment is prepared + # using source phone set and lexicon in src_lang and + # word.txt target lang dir. + +src_dict=../../wsj/s5/data/local/dict_nosp # dictionary for source dataset containing lexicon.txt, + # nonsilence_phones.txt,... + # lexicon.txt used to generate lexicon.txt for + # src-to-tgt transfer. + +src_gmm_dir=../../wsj/s5/exp/tri4b # source gmm dir used to generate alignments + # for target data. 
+ +src_tree_dir=../../wsj/s5/exp/chain/tree_a_sp # chain tree-dir for src data; + # the alignment in target domain is + # converted using src-tree + # End configuration section. echo "$0 $@" # Print the command line for logging @@ -69,24 +87,6 @@ fi # nnet3 setup, and you can skip them by setting "--stage 8" if you have already # run those things. - -# src directories -src_extractor_dir=$srcdir/exp/nnet3/extractor -src_mdl=$srcdir/exp/chain/tdnn${src_tdnn_affix}_sp/final.mdl # input dnn model for source data - # that is used in transfer learning. - -src_lang=$srcdir/data/lang # source lang directory used to generate source model. - # new new lang dir for transfer learning prepared - # using source phone set, lexicon in src_lang and - # target word list. - -src_gmm_dir=$srcdir/exp/tri4b # source gmm dir used to generate alignments - # for target data. - -src_tree_dir=$srcdir/exp/chain/tree_a_sp # chain tree-dir for src data; - # the alignment in target domain is - # converted using src-tree - # dirs for src-to-tgt transfer learning experiment lang_src_tgt=data/lang_wsj_rm # This dir is prepared using phones.txt and lexicon from # WSJ and wordlist and G.fst from RM. @@ -95,7 +95,26 @@ lat_dir=exp/tri3b_lats${src_tree_dir:+_wsj} dir=exp/chain/tdnn_wsj_rm${tdnn_affix} -required_files="$src_mdl $src_extractor_dir/final.dubm $src_extractor_dir/final.mat $src_extractor_dir/final.ie $src_lang/phones.txt $srcdir/data/local/dict_nosp/lexicon.txt $src_gmm_dir/final.mdl $src_tree_dir/tree" +required_files="$src_mfcc_config $src_mdl $src_lang/phones.txt $src_dict/lexicon.txt $src_gmm_dir/final.mdl $src_tree_dir/tree" + + +use_ivector=false +ivector_dim=$(nnet3-am-info --print-args=false $src_mdl | grep "ivector-dim" | cut -d" " -f2) +if [ "$ivector_dim" == "" ]; then ivector_dim=0 ; fi + +if [ ! -z $src_ivec_extractor_dir ]; then + if [ $ivector_dim -eq 0 ]; then + echo "source ivector extractor dir '$src_ivec_extractor_dir' is specified but ivector is not used in training the source model '$src_mdl'." + else + required_files="$required_files $src_ivec_extractor_dir/final.dubm $src_ivec_extractor_dir/final.mat $src_ivec_extractor_dir/final.ie" + use_ivector=true + fi +else + if [ $ivector_dim -gt 0 ]; then + echo "ivector is used in training the source model '$src_mdl' but no ivector extractor dir for source model specified." && exit 1; + fi +fi + for f in $required_files; do if [ ! -f $f ]; then @@ -106,7 +125,7 @@ done if [ $stage -le -1 ]; then echo "$0: prepare lexicon.txt for RM using WSJ lexicon." if ! cmp -s <(grep -v "^#" $src_lang/phones.txt) <(grep -v "^#" data/lang/phones.txt); then - local/prepare_wsj_rm_lang.sh $srcdir/data/local/dict_nosp $srcdir/data/lang $lang_src_tgt + local/prepare_wsj_rm_lang.sh $src_dict $src_lang $lang_src_tgt else rm -rf $lang_src_tgt 2>/dev/null || true cp -r data/lang $lang_src_tgt @@ -114,10 +133,10 @@ if [ $stage -le -1 ]; then fi local/online/run_nnet2_common.sh --stage $stage \ - --ivector-dim 100 \ + --ivector-dim $ivector_dim \ --nnet-affix "$nnet_affix" \ - --mfcc-config $srcdir/conf/mfcc_hires.conf \ - --extractor $src_extractor_dir || exit 1; + --mfcc-config $src_mfcc_config \ + --extractor $src_ivec_extractor_dir || exit 1; if [ $stage -le 4 ]; then echo "$0: Generate alignment using source model." 
@@ -156,14 +175,15 @@ if [ $stage -le 8 ]; then /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi # exclude phone_LM and den.fst generation training stages - if [ $train_stage -lt -4 ]; then - train_stage=-4 - fi + if [ $train_stage -lt -4 ]; then train_stage=-4 ; fi + + ivector_dir= + if $use_ivector; then ivector_dir="exp/nnet2${nnet_affix}/ivectors" ; fi steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ --trainer.input-model $dir/input.raw \ - --feat.online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ + --feat.online-ivector-dir "$ivector_dir" \ --chain.xent-regularize 0.1 \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize 0.1 \ @@ -172,36 +192,37 @@ if [ $stage -le 8 ]; then --chain.apply-deriv-weights false \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width $frames_per_eg \ - --trainer.num-chunk-per-minibatch=$minibatch_size \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch=128 \ --trainer.frames-per-iter 1000000 \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.num-jobs-initial=$num_jobs_initial \ - --trainer.optimization.num-jobs-final=$num_jobs_final \ - --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ - --trainer.optimization.final-effective-lrate=$final_effective_lrate \ - --trainer.max-param-change $max_param_change \ - --cleanup.remove-egs false \ + --trainer.num-epochs 2 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.005 \ + --trainer.optimization.final-effective-lrate=0.0005 \ + --trainer.max-param-change 2 \ + --cleanup.remove-egs true \ --feat-dir data/train_hires \ --tree-dir $src_tree_dir \ --lat-dir $lat_dir \ --dir $dir || exit 1; fi -if [ $stage -le 9 ]; then +if [ $stage -le 9 ] && $use_ivector; then steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ - data/test_hires $srcdir/exp/nnet3/extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; + data/test_hires $src_ivec_extractor_dir exp/nnet2${nnet_affix}/ivectors_test || exit 1; fi if [ $stage -le 10 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
+ ivec_opt="" + if $use_ivector;then ivec_opt="--online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test" ; fi utils/mkgraph.sh --self-loop-scale 1.0 $lang_src_tgt $dir $dir/graph steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --scoring-opts "--min-lmwt 1" \ - --nj 20 --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ + --nj 20 --cmd "$decode_cmd" $ivec_opt \ $dir/graph data/test_hires $dir/decode || exit 1; fi wait; diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index d1df31b93ac..9203f9ad1dc 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -32,32 +32,41 @@ train_stage=-4 get_egs_stage=-10 dir=exp/chain/tdnn_wsj_rm_1c -# training options -frames_per_chunk=150 -num_epochs=2 -initial_effective_lrate=0.005 -final_effective_lrate=0.0005 -max_param_change=2.0 -num_jobs_initial=2 -num_jobs_final=4 -minibatch_size=128 -frames_per_eg=150 -remove_egs=false -xent_regularize=0.1 - # configs for transfer learning -common_egs_dir= -srcdir=../../wsj/s5 # base directory for source data which is the base dir - # for source ivector extractor, scr chain model and - # lang dir. -src_tdnn_affix=1d # tdnn affix used for src chain model. +common_egs_dir= primary_lr_factor=0.25 # learning-rate factor for all except last layer in transferred source model nnet_affix=_online_wsj phone_lm_scales="1,10" # comma-separated list of integer valued scale weights # to scale different phone sequences for different alignments # e.g. (src-weight,target-weight)=(10,1) + +# model and dirs for source model used for transfer learning +src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl # input chain model + # trained on source dataset (wsj). + # This model is transfered to the target domain. + +src_mfcc_config=../../wsj/s5/conf/mfcc_hires.conf # mfcc config used to extract higher dim + # mfcc features used for ivector training + # in source domain. +src_ivec_extractor_dir= # source ivector extractor dir used to extract ivector for + # source data and the ivector for target data is extracted using this extractor. + # It should be nonempty, if ivector is used in source model training. + +src_lang=../../wsj/s5/data/lang # source lang directory used to train source model. + # new new lang dir for transfer learning experiment is prepared + # using source phone set and lexicon in src_lang and + # word.txt target lang dir. +src_dict=../../wsj/s5/data/local/dict_nosp # dictionary for source dataset containing lexicon.txt, + # nonsilence_phones.txt,... + # lexicon.txt used to generate lexicon.txt for + # src-to-tgt transfer. + +src_tree_dir=../../wsj/s5/exp/chain/tree_a_sp # chain tree-dir for src data; + # the alignment in target domain is + # converted using src-tree + # End configuration section. echo "$0 $@" # Print the command line for logging @@ -78,28 +87,32 @@ fi # nnet3 setup, and you can skip them by setting "--stage 8" if you have already # run those things. -# src directories -src_extractor_dir=$srcdir/exp/nnet3/extractor -src_mdl=$srcdir/exp/chain/tdnn${src_tdnn_affix}_sp/final.mdl # input dnn model for source data - # that is used in transfer learning. - -src_lang=$srcdir/data/lang # source lang directory used to generate source model. - # new new lang dir for transfer learning prepared - # using source phone set, lexicon in src_lang and - # target word list. 
- -src_tree_dir=$srcdir/exp/chain/tree_a_sp # chain tree-dir for source dataset; - # the alignment in target domain is - # converted using src-tree - # dirs for src-to-tgt transfer experiment -lang=data/lang_chain_5n_wsj +lang_dir=data/lang_chain_5n # lang dir for target data. lang_src_tgt=data/lang_wsj_rm # This dir is prepared using phones.txt and lexicon from # source(WSJ) and and wordlist and G.fst from target(RM) ali_dir=exp/chain/chain_ali_wsj -lat_dir=exp/chain_lats${src_tree_dir:+_wsj} +lat_dir=exp/chain_lats_wsj + +required_files="$src_mfcc_config $src_mdl $src_lang/phones.txt $src_dict/lexicon.txt $src_tree_dir/tree" + +use_ivector=false +ivector_dim=$(nnet3-am-info --print-args=false $src_mdl | grep "ivector-dim" | cut -d" " -f2) +if [ "$ivector_dim" == "" ]; then ivector_dim=0 ; fi + +if [ ! -z $src_ivec_extractor_dir ]; then + if [ $ivector_dim -eq 0 ]; then + echo "source ivector extractor dir '$src_ivec_extractor_dir' is specified but ivector is not used in training the source model '$src_mdl'." + else + required_files="$required_files $src_ivec_extractor_dir/final.dubm $src_ivec_extractor_dir/final.mat $src_ivec_extractor_dir/final.ie" + use_ivector=true + fi +else + if [ $ivector_dim -gt 0 ]; then + echo "ivector is used in training the source model '$src_mdl' but no ivector extractor dir for source model specified." && exit 1; + fi +fi -required_files="$src_mdl $src_extractor_dir/final.dubm $src_extractor_dir/final.mat $src_extractor_dir/final.ie $src_lang/phones.txt $srcdir/data/local/dict_nosp/lexicon.txt $src_tree_dir/tree" for f in $required_files; do if [ ! -f $f ]; then @@ -109,39 +122,40 @@ done if [ $stage -le -1 ]; then echo "$0: prepare lang for RM-WSJ using WSJ phone set and lexicon and RM word list." - if ! cmp -s <(grep -v "^#" $src_lang/phones.txt) <(grep -v "^#" data/lang/phones.txt); then - local/prepare_wsj_rm_lang.sh $srcdir/data/local/dict_nosp $src_lang $lang_dir + if ! cmp -s <(grep -v "^#" $src_lang/phones.txt) <(grep -v "^#" $lang_dir/phones.txt); then + local/prepare_wsj_rm_lang.sh $src_dict $src_lang $lang_src_tgt || exit 1; else - rm -rf $lang_dir 2>/dev/null || true - cp -r data/lang $lang_dir + rm -rf $lang_src_tgt 2>/dev/null || true + cp -r $lang_dir $lang_src_tgt fi fi local/online/run_nnet2_common.sh --stage $stage \ - --ivector-dim 100 \ + --ivector-dim $ivector_dim \ --nnet-affix "$nnet_affix" \ - --mfcc-config $srcdir/conf/mfcc_hires.conf \ - --extractor $src_extractor_dir || exit 1; + --mfcc-config $src_mfcc_config \ + --extractor $src_ivec_extractor_dir || exit 1; src_mdl_dir=`dirname $src_mdl` +ivec_opt="" +if $use_ivector;then ivec_opt="--online-ivector-dir exp/nnet2${nnet_affix}/ivectors" ; fi + if [ $stage -le 4 ]; then echo "$0: Generate alignment using source chain model." - steps/nnet3/align.sh --nj 100 --cmd "$train_cmd" \ - --online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ + steps/nnet3/align.sh --nj 100 --cmd "$train_cmd" $ivec_opt \ --extra-left-context-initial 0 --extra-right-context-final 0 \ --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \ - --frames-per-chunk $frames_per_chunk \ + --frames-per-chunk 150 \ data/train_hires $lang_src_tgt $src_mdl_dir $ali_dir || exit 1; fi if [ $stage -le 5 ]; then # Get the alignments as lattices (gives the chain training more freedom). 
# use the same num-jobs as the alignments - steps/nnet3/align_lats.sh --nj 100 --cmd "$train_cmd" \ + steps/nnet3/align_lats.sh --nj 100 --cmd "$train_cmd" $ivec_opt \ --acoustic-scale 1.0 --extra-left-context-initial 0 --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ + --frames-per-chunk 150 \ --scale-opts "--transition-scale=1.0 --self-loop-scale=1.0" \ - --online-ivector-dir exp/nnet2${nnet_affix}/ivectors data/train_hires \ - $lang_src_tgt $src_mdl_dir $lat_dir || exit 1; + data/train_hires $lang_src_tgt $src_mdl_dir $lat_dir || exit 1; rm $lat_dir/fsts.*.gz # save space fi @@ -165,9 +179,11 @@ if [ $stage -le 8 ]; then /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi # exclude phone_LM and den.fst generation training stage - if [ $train_stage -lt -4 ]; then - train_stage=-4 - fi + if [ $train_stage -lt -4 ]; then train_stage=-4 ; fi + + ivector_dir= + if $use_ivector; then ivector_dir="exp/nnet2${nnet_affix}/ivectors" ; fi + # we use chain model from source to generate lats for target and the # tolerance used in chain egs generation using this lats should be 1 or 2 which is # (source_egs_tolerance/frame_subsampling_factor) @@ -176,8 +192,8 @@ if [ $stage -le 8 ]; then steps/nnet3/chain/train.py --stage $train_stage ${chain_opts[@]} \ --cmd "$decode_cmd" \ --trainer.input-model $dir/input.raw \ - --feat.online-ivector-dir exp/nnet2${nnet_affix}/ivectors \ - --chain.xent-regularize $xent_regularize \ + --feat.online-ivector-dir "$ivector_dir" \ + --chain.xent-regularize 0.1 \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize 0.1 \ --chain.leaky-hmm-coefficient 0.1 \ @@ -185,36 +201,38 @@ if [ $stage -le 8 ]; then --chain.apply-deriv-weights false \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width $frames_per_eg \ - --trainer.num-chunk-per-minibatch=$minibatch_size \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch=128 \ --trainer.frames-per-iter 1000000 \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.num-jobs-initial=$num_jobs_initial \ - --trainer.optimization.num-jobs-final=$num_jobs_final \ - --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ - --trainer.optimization.final-effective-lrate=$final_effective_lrate \ - --trainer.max-param-change $max_param_change \ - --cleanup.remove-egs false \ + --trainer.num-epochs 2 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.005 \ + --trainer.optimization.final-effective-lrate=0.0005 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ --feat-dir data/train_hires \ --tree-dir $src_tree_dir \ --lat-dir $lat_dir \ --dir $dir || exit 1; fi -if [ $stage -le 9 ]; then +if [ $stage -le 9 ] && $use_ivector; then steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ - data/test_hires $srcdir/exp/nnet3/extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; + data/test_hires $src_ivec_extractor_dir exp/nnet2${nnet_affix}/ivectors_test || exit 1; fi if [ $stage -le 10 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
+ tes_ivec_opt="" + if $use_ivector;then test_ivec_opt="--online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test" ; fi + utils/mkgraph.sh --self-loop-scale 1.0 $lang_src_tgt $dir $dir/graph steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --scoring-opts "--min-lmwt 1" \ - --nj 20 --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test \ + --nj 20 --cmd "$decode_cmd" $test_ivec_opt \ $dir/graph data/test_hires $dir/decode || exit 1; fi wait; diff --git a/egs/rm/s5/local/online/run_nnet2_common.sh b/egs/rm/s5/local/online/run_nnet2_common.sh index e1a4676da66..82b7c346aa2 100755 --- a/egs/rm/s5/local/online/run_nnet2_common.sh +++ b/egs/rm/s5/local/online/run_nnet2_common.sh @@ -8,6 +8,8 @@ nnet_affix=_online extractor=exp/nnet2${nnet_affix}/extractor ivector_dim=50 mfcc_config=conf/mfcc_hires.conf +use_ivector=true # If false, it skips training ivector extractor and + # ivector extraction stages. . cmd.sh . ./path.sh . ./utils/parse_options.sh @@ -48,7 +50,7 @@ if [ $stage -le 0 ]; then fi train_set=${train_set}_hires -if [ ! -f $extractor/final.dubm ]; then +if [ ! -f $extractor/final.ie ] && [ $ivector_dim -gt 0 ]; then if [ $stage -le 1 ]; then mkdir -p exp/nnet2${nnet_affix} steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 40 --num-frames 200000 \ @@ -64,7 +66,7 @@ if [ ! -f $extractor/final.dubm ]; then fi fi -if [ $stage -le 3 ]; then +if [ $stage -le 3 ] && [ $ivector_dim -gt 0 ]; then # having a larger number of speakers is helpful for generalization, and to # handle per-utterance decoding well (iVector starts at zero). steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set} data/${train_set}_max2 From 775b34d981b35a3b42947660253971cd6ff1d2b1 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 23 Aug 2017 10:53:41 -0400 Subject: [PATCH 044/174] minor change to prepare for tf learning --- egs/wsj/s5/steps/libs/nnet3/train/common.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index a3beb2e5bef..caf6d953411 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -422,6 +422,11 @@ def verify_egs_dir(egs_dir, feat_dim, ivector_dim, ivector_extractor_id, egs_left_context, egs_right_context, left_context, right_context)) + if left_context_initial == -1: + left_context_initial = left_context + if right_context_final == -1: + right_context_final = right_context + # the condition on the initial/final context is an equality condition, # not an inequality condition, as there is no mechanism to 'correct' the # context (by subtracting context) while copying the egs, like there is From e0fd23ecd56cf35e1a0e87001c772adb0a646345 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 23 Aug 2017 15:34:50 -0400 Subject: [PATCH 045/174] semisup: Separate tolerance for silence --- egs/fisher_english/s5/local/score.sh | 61 +------------------ egs/fisher_english/s5/local/wer_output_filter | 16 +++++ egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 13 ++++ src/chain/chain-supervision.cc | 31 +++++++++- src/chain/chain-supervision.h | 13 +++- 5 files changed, 70 insertions(+), 64 deletions(-) mode change 100755 => 120000 egs/fisher_english/s5/local/score.sh create mode 100755 egs/fisher_english/s5/local/wer_output_filter diff --git a/egs/fisher_english/s5/local/score.sh b/egs/fisher_english/s5/local/score.sh deleted file mode 100755 index 702a57c94c6..00000000000 --- 
a/egs/fisher_english/s5/local/score.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# Copyright Johns Hopkins University (Author: Daniel Povey) 2012. Apache 2.0. - -# begin configuration section. -cmd=run.pl -min_lmwt=5 -max_lmwt=17 -iter= -#end configuration section. - -[ -f ./path.sh ] && . ./path.sh -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " - echo " Options:" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." - echo " --min_lmwt # minumum LM-weight for lattice rescoring " - echo " --max_lmwt # maximum LM-weight for lattice rescoring " - exit 1; -fi - -data=$1 -lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. -dir=$3 - -model=$dir/../final.mdl # assume model one level up from decoding dir. - -for f in $data/text $lang/words.txt $dir/lat.1.gz; do - [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; -done - -name=`basename $data`; # e.g. eval2000 - -mkdir -p $dir/scoring/log - - -function filter_text { - perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; } - while() { @A = split(" ", $_); $id = shift @A; print "$id "; - foreach $a (@A) { if (!defined $bad{$a}) { print "$a "; }} print "\n"; }' \ - '[NOISE]' '[LAUGHTER]' '[VOCALIZED-NOISE]' '' '%HESITATION' -} - -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ - lattice-best-path --lm-scale=LMWT --word-symbol-table=$lang/words.txt \ - "ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1; - -for lmwt in `seq $min_lmwt $max_lmwt`; do - utils/int2sym.pl -f 2- $lang/words.txt <$dir/scoring/$lmwt.tra | \ - filter_text > $dir/scoring/$lmwt.txt || exit 1; -done - -filter_text <$data/text >$dir/scoring/text.filt - -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ - compute-wer --text --mode=present \ - ark:$dir/scoring/text.filt ark:$dir/scoring/LMWT.txt ">&" $dir/wer_LMWT || exit 1; - -exit 0 diff --git a/egs/fisher_english/s5/local/score.sh b/egs/fisher_english/s5/local/score.sh new file mode 120000 index 00000000000..6a200b42ed3 --- /dev/null +++ b/egs/fisher_english/s5/local/score.sh @@ -0,0 +1 @@ +../steps/scoring/score_kaldi_wer.sh \ No newline at end of file diff --git a/egs/fisher_english/s5/local/wer_output_filter b/egs/fisher_english/s5/local/wer_output_filter new file mode 100755 index 00000000000..2514c385038 --- /dev/null +++ b/egs/fisher_english/s5/local/wer_output_filter @@ -0,0 +1,16 @@ +#!/usr/bin/perl + +@filter_words = ('[NOISE]', '[LAUGHTER]', '[VOCALIZED-NOISE]', '', '%HESITATION'); +foreach $w (@filter_words) { + $bad{$w} = 1; $w = lc $w; $bad{$w} = 1; +} +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + + foreach $a (@A) { + if (!defined $bad{$a}) { print "$a "; } + } + print "\n"; +} diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index f3202778daa..ad726686e09 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -49,6 +49,9 @@ frames_per_iter=400000 # each iteration of training, see this many frames per right_tolerance= #CTC right tolerance == max label delay. left_tolerance= +right_tolerance_silence= # Tolerances for silence phones +left_tolerance_silence= + transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms stage=0 @@ -309,6 +312,16 @@ fi [ ! -z $phone_insertion_penalty ] && \ chain_supervision_all_opts="$chain_supervision_all_opts --phone-ins-penalty=$phone_insertion_penalty" +[ ! 
-z $right_tolerance_silence ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --right-tolerance-silence=$right_tolerance_silence" + +[ ! -z $left_tolerance_silence ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance-silence=$left_tolerance_silence" + +if [ ! -z $left_tolerance_silence ] && [ ! -z $right_tolerance_silence ]; then + chain_supervision_all_opts="$chain_supervision_all_opts --silence-phones=$(cat $lang/phones/silence_phones.csl)" +fi + echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context echo $left_context_initial > $dir/info/left_context_initial diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index fb0f0284df7..f7b5caf0e17 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -76,6 +76,11 @@ void SupervisionOptions::Check() const { KALDI_ASSERT(left_tolerance >= 0 && right_tolerance >= 0 && frame_subsampling_factor > 0 && left_tolerance + right_tolerance >= frame_subsampling_factor); + + if (!silence_phones_str.empty()) { + KALDI_ASSERT(left_tolerance_silence >= 0 && right_tolerance_silence >= 0 && + left_tolerance_silence + right_tolerance_silence >= frame_subsampling_factor); + } } bool AlignmentToProtoSupervision(const SupervisionOptions &opts, @@ -145,8 +150,18 @@ bool ProtoSupervision::operator == (const ProtoSupervision &other) const { bool PhoneLatticeToProtoSupervisionInternal(const SupervisionOptions &opts, const CompactLattice &lat, - ProtoSupervision *proto_supervision) { + ProtoSupervision *proto_supervision) { opts.Check(); + + ConstIntegerSet silence_set; + if (!opts.silence_phones_str.empty()) { + std::vector silence_phones; + if (!SplitStringToIntegers(opts.silence_phones_str, ":,", false, + &silence_phones)) + KALDI_ERR << "Invalid silence-phones string " << opts.silence_phones_str; + silence_set.Init(silence_phones); + } + if (lat.NumStates() == 0) { KALDI_WARN << "Empty lattice provided"; return false; @@ -182,9 +197,19 @@ bool PhoneLatticeToProtoSupervisionInternal(const SupervisionOptions &opts, lat_arc.weight.Weight().Value1() * opts.lm_scale + opts.phone_ins_penalty), lat_arc.nextstate)); - int32 t_begin = std::max(0, (state_time - opts.left_tolerance)), + + int32 left_tolerance = opts.left_tolerance; + int32 right_tolerance = opts.right_tolerance; + if (!opts.silence_phones_str.empty()) { + if (silence_set.count(phone) > 0) { + left_tolerance = opts.left_tolerance_silence; + right_tolerance = opts.right_tolerance_silence; + } + } + + int32 t_begin = std::max(0, (state_time - left_tolerance)), t_end = std::min(num_frames, - (next_state_time + opts.right_tolerance)), + (next_state_time + right_tolerance)), t_begin_subsampled = (t_begin + factor - 1)/ factor, t_end_subsampled = (t_end + factor - 1)/ factor; for (int32 t_subsampled = t_begin_subsampled; diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index ce755f0cb63..88a5c05efbe 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -53,13 +53,18 @@ struct SupervisionOptions { BaseFloat weight; BaseFloat lm_scale; BaseFloat phone_ins_penalty; + int32 left_tolerance_silence; + int32 right_tolerance_silence; + std::string silence_phones_str; SupervisionOptions(): left_tolerance(5), right_tolerance(5), frame_subsampling_factor(1), weight(1.0), lm_scale(0.0), - phone_ins_penalty(0.0) { } + phone_ins_penalty(0.0), + left_tolerance_silence(0), + right_tolerance_silence(0) { } void Register(OptionsItf *opts) { 
opts->Register("left-tolerance", &left_tolerance, "Left tolerance for " @@ -78,6 +83,12 @@ struct SupervisionOptions { "supervision fst."); opts->Register("phone-ins-penalty", &phone_ins_penalty, "The penalty to penalize longer paths"); + opts->Register("left-tolerance-silence", &left_tolerance_silence, "Left tolerance for " + "shift in silence phone position relative to the alignment"); + opts->Register("right-tolerance-silence", &right_tolerance_silence, "Right tolerance for " + "shift in silence phone position relative to the alignment"); + opts->Register("silence-phones", &silence_phones_str, + "A comma separated list of silence phones"); } void Check() const; }; From 89e574b6835b3a6bc60cb7398abbd700008eb73f Mon Sep 17 00:00:00 2001 From: Pegita Date: Wed, 23 Aug 2017 20:11:27 -0400 Subject: [PATCH 046/174] modified comments in xconfig and train.py and modified scripts to generates alignments from lattices. --- .../local/chain/tuning/run_tdnn_wsj_rm_1a.sh | 25 ++++----- .../local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 56 ++++++++----------- .../local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 50 ++++++----------- egs/rm/s5/local/online/run_nnet2_common.sh | 3 + egs/wsj/s5/steps/align_fmllr_lats.sh | 16 ++++-- .../nnet3/train/chain_objf/acoustic_model.py | 17 +++--- egs/wsj/s5/steps/libs/nnet3/train/common.py | 12 ++-- .../steps/libs/nnet3/xconfig/basic_layers.py | 26 +++++---- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 50 ++++++++--------- egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py | 41 +++++++++++--- egs/wsj/s5/steps/nnet3/align_lats.sh | 11 +++- .../nnet3/chain/make_weighted_den_fst.sh | 27 +++++++-- egs/wsj/s5/steps/nnet3/chain/train.py | 30 +++++----- egs/wsj/s5/steps/nnet3/xconfig_to_configs.py | 6 +- 14 files changed, 201 insertions(+), 169 deletions(-) diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh index b15d904eb20..b6852f0f812 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -1,12 +1,13 @@ #!/bin/bash -# This script uses weight transfer as a Transfer learning method -# and use already trained model on wsj and removes the last layer and -# add new randomly initialized layer and retrain the whole network, -# while training new added layer using rm data. +# This script uses weight transfer as a Transfer learning method to transfer +# model trained on wsj to rm dataset. +# It uses already trained model on wsj and removes its last layer and +# adds new randomly initialized layer and retrains the whole network with +# smaller learning-rate, while training new added layer using rm data. # The chain config is as in run_tdnn_5n.sh and the result is: #System tdnn_5n tdnn_wsj_rm_1a -#WER 2.71 2.09 +#WER 2.71 1.68 set -e # configs for 'chain' @@ -14,6 +15,7 @@ stage=0 train_stage=-10 get_egs_stage=-10 dir=exp/chain/tdnn_wsj_rm_1a +xent_regularize=0.1 # configs for transfer learning src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl # input chain model @@ -21,7 +23,7 @@ src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl # input chain model # This model is transfered to the target domain. src_mfcc_config=../../wsj/s5/conf/mfcc_hires.conf # mfcc config used to extract higher dim - # mfcc features used for ivector training + # mfcc features for ivector training # in source domain. src_ivec_extractor_dir= # source ivector extractor dir used to extract ivector for # source data and the ivector for target data is extracted using this extractor. 
@@ -118,7 +120,7 @@ fi if [ $stage -le 7 ]; then echo "$0: creating neural net configs using the xconfig parser for"; echo "extra layers w.r.t source network."; - num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + num_targets=$(tree-info --print-args=false $treedir/tree |grep num-pdfs|awk '{print $2}') learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) mkdir -p $dir mkdir -p $dir/configs @@ -145,7 +147,7 @@ if [ $stage -le 8 ]; then echo "$0: generate egs for chain to train new model on rm dataset." if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi ivector_dir= if $use_ivector; then ivector_dir="exp/nnet2${nnet_affix}/ivectors" ; fi @@ -179,12 +181,7 @@ if [ $stage -le 8 ]; then --dir $dir || exit 1; fi -if [ $stage -le 9 ] && $use_ivector; then - steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ - data/test $src_ivec_extractor_dir exp/nnet2${nnet_affix}/ivectors_test || exit 1; -fi - -if [ $stage -le 10 ]; then +if [ $stage -le 9 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index 2e848b7bd94..fe3d8f267dd 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -19,7 +19,7 @@ # while training the last layer with higher learning-rate. # The chain config is as run_tdnn_5n.sh and the result is: # System tdnn_5n tdnn_wsj_rm_1a tdnn_wsj_rm_1b tdnn_wsj_rm_1c -# WER 2.71 2.09 3.45 3.38 +# WER 2.71 1.68 3.45 3.38 set -e # configs for 'chain' @@ -44,24 +44,24 @@ src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl # input chain model # This model is transfered to the target domain. src_mfcc_config=../../wsj/s5/conf/mfcc_hires.conf # mfcc config used to extract higher dim - # mfcc features used for ivector training + # mfcc features for ivector training # in source domain. src_ivec_extractor_dir= # source ivector extractor dir used to extract ivector for # source data and the ivector for target data is extracted using this extractor. # It should be nonempty, if ivector is used in source model training. -src_lang=../../wsj/s5/data/lang # source lang directory used to train source model. - # new new lang dir for transfer learning experiment is prepared - # using source phone set and lexicon in src_lang and - # word.txt target lang dir. +src_lang=../../wsj/s5/data/lang # source lang directory used to train source model. + # new lang dir for transfer learning experiment is prepared + # using source phone set phones.txt and lexicon.txt + # in src lang and dict dirs and words.txt in target lang dir. src_dict=../../wsj/s5/data/local/dict_nosp # dictionary for source dataset containing lexicon.txt, - # nonsilence_phones.txt,... - # lexicon.txt used to generate lexicon.txt for - # src-to-tgt transfer. + # nonsilence_phones.txt,... + # lexicon.txt used to generate lexicon.txt for + # src-to-tgt transfer. src_gmm_dir=../../wsj/s5/exp/tri4b # source gmm dir used to generate alignments - # for target data. + # for target data. 
src_tree_dir=../../wsj/s5/exp/chain/tree_a_sp # chain tree-dir for src data; # the alignment in target domain is @@ -90,8 +90,7 @@ fi # dirs for src-to-tgt transfer learning experiment lang_src_tgt=data/lang_wsj_rm # This dir is prepared using phones.txt and lexicon from # WSJ and wordlist and G.fst from RM. -ali_dir=exp/tri4b${src_tree_dir:+_wsj}_ali -lat_dir=exp/tri3b_lats${src_tree_dir:+_wsj} +lat_dir=exp/tri3b_lats_wsj dir=exp/chain/tdnn_wsj_rm${tdnn_affix} @@ -111,7 +110,7 @@ if [ ! -z $src_ivec_extractor_dir ]; then fi else if [ $ivector_dim -gt 0 ]; then - echo "ivector is used in training the source model '$src_mdl' but no ivector extractor dir for source model specified." && exit 1; + echo "ivector is used in training the source model '$src_mdl' but no ivector extractor dir for source model is specified." && exit 1; fi fi @@ -139,40 +138,34 @@ local/online/run_nnet2_common.sh --stage $stage \ --extractor $src_ivec_extractor_dir || exit 1; if [ $stage -le 4 ]; then - echo "$0: Generate alignment using source model." - steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ - data/train $lang_src_tgt $src_gmm_dir $ali_dir || exit 1; -fi - - -if [ $stage -le 5 ]; then # Get the alignments as lattices (gives the chain training more freedom). # use the same num-jobs as the alignments nj=$(cat exp/tri3b_ali/num_jobs) || exit 1; - steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/train \ - $lang_src_tgt $src_gmm_dir $lat_dir || exit 1; - rm $lat_dir/fsts.*.gz # save space + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \ + --generate-ali-from-lats true --stage 4 \ + data/train $lang_src_tgt $src_gmm_dir $lat_dir || exit 1; + rm $lat_dir/fsts.*.gz 2>/dev/null || true # save space fi -if [ $stage -le 6 ]; then +if [ $stage -le 5 ]; then # set the learning-rate-factor for initial network to be primary_lr_factor." $train_cmd $dir/log/generate_input_mdl.log \ nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ $src_mdl $dir/input.raw || exit 1; fi -if [ $stage -le 7 ]; then +if [ $stage -le 6 ]; then echo "$0: compute {den,normalization}.fst using weighted phone LM with wsj and rm weight $phone_lm_scales." steps/nnet3/chain/make_weighted_den_fst.sh --cmd "$train_cmd" \ --weights $phone_lm_scales \ --lm-opts '--num-extra-lm-states=200' \ - $src_tree_dir $ali_dir $dir || exit 1; + $src_tree_dir $lat_dir $dir || exit 1; fi -if [ $stage -le 8 ]; then +if [ $stage -le 7 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi # exclude phone_LM and den.fst generation training stages if [ $train_stage -lt -4 ]; then train_stage=-4 ; fi @@ -208,12 +201,7 @@ if [ $stage -le 8 ]; then --dir $dir || exit 1; fi -if [ $stage -le 9 ] && $use_ivector; then - steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ - data/test_hires $src_ivec_extractor_dir exp/nnet2${nnet_affix}/ivectors_test || exit 1; -fi - -if [ $stage -le 10 ]; then +if [ $stage -le 8 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
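With --generate-ali-from-lats true, the scripts above no longer need a separate align_fmllr.sh pass: the lattice directory also provides the alignments, so make_weighted_den_fst.sh can take $lat_dir directly. Conceptually this reduces to the following sketch (the actual change to align_fmllr_lats.sh appears further below; $lat_dir and $train_cmd are the recipe variables, and 0.1 is the typical GMM acoustic scale):

  # derive 1-best alignments from the lattices that were just generated
  nj=$(cat $lat_dir/num_jobs)
  $train_cmd JOB=1:$nj $lat_dir/log/generate_alignments.JOB.log \
    lattice-best-path --acoustic-scale=0.1 \
      "ark:gunzip -c $lat_dir/lat.JOB.gz |" \
      ark:/dev/null "ark:|gzip -c >$lat_dir/ali.JOB.gz"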
diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index 9203f9ad1dc..0c70d0970ea 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -1,6 +1,6 @@ #!/bin/bash -# _1c is as _1b but it uses src chain model instead of GMM model to generate -# alignments for RM using SWJ model. +# _1c is as _1b but it uses source chain-trained DNN model instead of GMM model +# to generate alignments for RM using SWJ model. # _1b is as _1a, but different as follows # 1) uses src phone set phones.txt and new lexicon generated using word pronunciation @@ -18,11 +18,11 @@ # This script uses weight transfer as Transfer learning method -# and use already trained model on wsj and fine-tune the whole network using rm data -# while training the last layer with higher learning-rate. +# and use already trained model on wsj and fine-tune the whole network using +# rm data while training the last layer with higher learning-rate. # The chain config is as run_tdnn_5n.sh and the result is: # System tdnn_5n tdnn_wsj_rm_1a tdnn_wsj_rm_1b tdnn_wsj_rm_1c -# WER 2.71 2.09 3.45 3.38 +# WER 2.71 1.68 3.45 3.38 set -e @@ -44,8 +44,8 @@ phone_lm_scales="1,10" # comma-separated list of integer valued scale weights # model and dirs for source model used for transfer learning src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl # input chain model - # trained on source dataset (wsj). - # This model is transfered to the target domain. + # trained on source dataset (wsj) and + # this model is transfered to the target domain. src_mfcc_config=../../wsj/s5/conf/mfcc_hires.conf # mfcc config used to extract higher dim # mfcc features used for ivector training @@ -54,10 +54,10 @@ src_ivec_extractor_dir= # source ivector extractor dir used to extract ivector # source data and the ivector for target data is extracted using this extractor. # It should be nonempty, if ivector is used in source model training. -src_lang=../../wsj/s5/data/lang # source lang directory used to train source model. - # new new lang dir for transfer learning experiment is prepared - # using source phone set and lexicon in src_lang and - # word.txt target lang dir. +src_lang=../../wsj/s5/data/lang # source lang directory used to train source model. + # new lang dir for transfer learning experiment is prepared + # using source phone set phone.txt and lexicon.txt in src lang dir and + # word.txt target lang dir. src_dict=../../wsj/s5/data/local/dict_nosp # dictionary for source dataset containing lexicon.txt, # nonsilence_phones.txt,... # lexicon.txt used to generate lexicon.txt for @@ -91,7 +91,6 @@ fi lang_dir=data/lang_chain_5n # lang dir for target data. lang_src_tgt=data/lang_wsj_rm # This dir is prepared using phones.txt and lexicon from # source(WSJ) and and wordlist and G.fst from target(RM) -ali_dir=exp/chain/chain_ali_wsj lat_dir=exp/chain_lats_wsj required_files="$src_mfcc_config $src_mdl $src_lang/phones.txt $src_dict/lexicon.txt $src_tree_dir/tree" @@ -140,18 +139,10 @@ ivec_opt="" if $use_ivector;then ivec_opt="--online-ivector-dir exp/nnet2${nnet_affix}/ivectors" ; fi if [ $stage -le 4 ]; then - echo "$0: Generate alignment using source chain model." 
- steps/nnet3/align.sh --nj 100 --cmd "$train_cmd" $ivec_opt \ - --extra-left-context-initial 0 --extra-right-context-final 0 \ - --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \ - --frames-per-chunk 150 \ - data/train_hires $lang_src_tgt $src_mdl_dir $ali_dir || exit 1; -fi - -if [ $stage -le 5 ]; then # Get the alignments as lattices (gives the chain training more freedom). # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj 100 --cmd "$train_cmd" $ivec_opt \ + --generate-ali-from-lats true \ --acoustic-scale 1.0 --extra-left-context-initial 0 --extra-right-context-final 0 \ --frames-per-chunk 150 \ --scale-opts "--transition-scale=1.0 --self-loop-scale=1.0" \ @@ -159,24 +150,24 @@ if [ $stage -le 5 ]; then rm $lat_dir/fsts.*.gz # save space fi -if [ $stage -le 6 ]; then +if [ $stage -le 5 ]; then # set the learning-rate-factor for initial network to be primary_lr_factor." $train_cmd $dir/log/generate_input_mdl.log \ nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ $src_mdl $dir/input.raw || exit 1; fi -if [ $stage -le 7 ]; then +if [ $stage -le 6 ]; then echo "$0: compute {den,normalization}.fst using weighted phone LM." steps/nnet3/chain/make_weighted_den_fst.sh --weights $phone_lm_scales \ --lm-opts '--num-extra-lm-states=200' \ - $src_tree_dir $ali_dir $dir || exit 1; + $src_tree_dir $lat_dir $dir || exit 1; fi -if [ $stage -le 8 ]; then +if [ $stage -le 7 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi # exclude phone_LM and den.fst generation training stage if [ $train_stage -lt -4 ]; then train_stage=-4 ; fi @@ -217,12 +208,7 @@ if [ $stage -le 8 ]; then --dir $dir || exit 1; fi -if [ $stage -le 9 ] && $use_ivector; then - steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ - data/test_hires $src_ivec_extractor_dir exp/nnet2${nnet_affix}/ivectors_test || exit 1; -fi - -if [ $stage -le 10 ]; then +if [ $stage -le 8 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. diff --git a/egs/rm/s5/local/online/run_nnet2_common.sh b/egs/rm/s5/local/online/run_nnet2_common.sh index 82b7c346aa2..58ac72d374b 100755 --- a/egs/rm/s5/local/online/run_nnet2_common.sh +++ b/egs/rm/s5/local/online/run_nnet2_common.sh @@ -73,4 +73,7 @@ if [ $stage -le 3 ] && [ $ivector_dim -gt 0 ]; then steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 40 \ data/${train_set}_max2 $extractor exp/nnet2${nnet_affix}/ivectors || exit 1; + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 10 \ + data/test_hires $extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1; fi diff --git a/egs/wsj/s5/steps/align_fmllr_lats.sh b/egs/wsj/s5/steps/align_fmllr_lats.sh index d0f4675cf83..187d9bf5687 100755 --- a/egs/wsj/s5/steps/align_fmllr_lats.sh +++ b/egs/wsj/s5/steps/align_fmllr_lats.sh @@ -7,7 +7,7 @@ # alignments of alternative pronunciations in them. Mainly intended # as a precursor to CTC training for now. -# Begin configuration section. +# Begin configuration section. 
stage=0 nj=4 cmd=run.pl @@ -23,6 +23,7 @@ final_beam=20 # For the lattice-generation phase there is no retry-beam. This # gmm-latgen-faster defaults to may help.) boost_silence=1.0 # factor by which to boost silence during alignment. fmllr_update_type=full +generate_ali_from_lats=false # If true, alingments generated from lattices. # End configuration options. echo "$0 $@" # Print the command line for logging @@ -96,7 +97,7 @@ mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_sil ## because the other scripts write them without transition probs. if [ $stage -le 0 ]; then echo "$0: compiling training graphs" - tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $scale_opts $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; @@ -140,7 +141,7 @@ if [ $stage -le 3 ]; then # Warning: gmm-latgen-faster doesn't support a retry-beam so you may get more # alignment errors (however, it does have a default min-active=200 so this # will tend to reduce alignment errors). - # --allow_partial=false makes sure we reach the end of the decoding graph. + # --allow_partial=false makes sure we reach the end of the decoding graph. # --word-determinize=false makes sure we retain the alternative pronunciations of # words (including alternatives regarding optional silences). # --lattice-beam=$beam keeps all the alternatives that were within the beam, @@ -154,7 +155,14 @@ if [ $stage -le 3 ]; then "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; fi -rm $dir/pre_ali.*.gz +if [ $stage -le 4 ] && $generate_ali_from_lats; then + # If generate_alignments is true, ali.*.gz is generated in lats dir + $cmd JOB=1:$nj $dir/log/generate_alignments.JOB.log \ + lattice-best-path --acoustic-scale=$acoustic_scale "ark:gunzip -c $dir/lat.JOB.gz |" \ + ark:/dev/null "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +rm $dir/pre_ali.*.gz 2>/dev/null || true echo "$0: done generating lattices from training transcripts." diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 1b0878ea9a8..b44170149e8 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -426,14 +426,15 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, common_lib.force_symlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) -def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_mdl=None): - """ Adds the first layer; It will also prepare the acoustic model - with the transition model. - If input_mdl is specified, no initial network preparation(adding - first layer) is done on that and this model is prepared instead of - '0.raw' acoustice model with the transition model. +def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_model=None): + """ This function adds the first layer; It will also prepare the acoustic + model with the transition model. + If 'input_model' is specified, no initial network preparation(adding + first layer) is done and this model is used initial 'raw' model + instead of '0.raw' model to prepare '0.mdl' acoustic model by adding the + transition model. 
""" - if input_mdl is None: + if input_model is None: common_train_lib.prepare_initial_network(dir, run_opts, srand=srand) @@ -447,7 +448,7 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_mdl=None): """{command} {dir}/log/init_mdl.log \ nnet3-am-init {dir}/0.trans_mdl {raw_mdl} \ {dir}/0.mdl""".format(command=run_opts.command, dir=dir, - raw_mdl=(input_mdl if input_mdl is not None + raw_mdl=(input_model if input_model is not None else '{0}/0.raw'.format(dir)))) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 960d327b72a..273d21e5c94 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -357,13 +357,11 @@ def parse_generic_config_vars_file(var_file): raise Exception('Error while parsing the file {0}'.format(var_file)) -def parse_input_model(input_model): - """ This function parses input_model and outputs left and right contexts - for input_mdl. This function is an alternative to configs/vars, - if this file is not available. - e.g. input_mdl is not generated using - configs and directly passed to train.py - using --trainer.input-model. +def get_input_model_info(input_model): + """ This function returns a dictionary with keys "model_{left/right}_context" + and values equal to the left/right model contexts for input_model. + This function is useful when using the --trainer.input-model option + instead of initializing model using configs. """ variables = {} try: diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index fde10ba3d00..43c685c01cc 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -42,7 +42,8 @@ def __init__(self, first_token, key_to_value, all_layers): raise RuntimeError("Invalid value: name={0}".format( key_to_value['name'])) for prev_layer in all_layers: - if self.name == prev_layer.name and prev_layer.layer_type is not 'auxiliary': + if (self.name == prev_layer.name and + prev_layer.layer_type is not 'existing'): raise RuntimeError("Name '{0}' is used for more than one " "layer.".format(self.name)) @@ -1075,19 +1076,24 @@ def get_full_config(self): return ans -class XconfigAuxiliaryLayer(XconfigLayerBase): +class XconfigExistingLayer(XconfigLayerBase): """This class is for lines like - 'auxiliary name=aux dim=40' - in the config file. - This layer contains dim and name. - This class is useful in cases like transferring - existing models and using {input,output,component}-nodes - of that model as input to new layers. + 'existing name=tdnn1.affine dim=40' + This layer contains 'dim' and 'name' and it is not presented in any actual + config files. + Layers of this type are created internally for all component nodes in + an existing neural net model for use as input to other layers. + (i.e. get_model_component_info, which is called in + steps/nnet3/xconfig_to_configs.py, returns a list of 'existing' + layers for component nodes used in 'existing_model') + This class is useful in cases like transferring existing model + and using {input, output, component}-nodes + in the model as input to new layers. 
""" def __init__(self, first_token, key_to_value, prev_names=None): - assert first_token == 'auxiliary' + assert first_token == 'existing' XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) @@ -1096,7 +1102,7 @@ def set_default_configs(self): def check_configs(self): if self.config['dim'] <= 0: - raise RuntimeError("Dimension of auxiliary-layer '{0}'" + raise RuntimeError("Dimension of existing-layer '{0}'" "should be positive.".format(self.name)) def get_input_descriptor_names(self): diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 96f5e824f1e..63f057148dd 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -68,15 +68,17 @@ def xconfig_line_to_object(config_line, prev_layers = None): raise # This function reads existing model (*.raw or *.mdl) and returns array of -# XconfigInputLayer one per input-node or component-node with same 'name' used -# in raw model and 'dim' equal to 'output-dim' for component-node and 'dim' for -# input-node. +# XconfigExistingLayer one per {input,output}-node or component-node with same +# 'name' used in raw model and 'dim' equal to 'output-dim' for component-node +# and 'dim' for {input,output}-node. def get_model_component_info(model_filename): all_layers = [] try: f = open(model_filename, 'r') except Exception as e: - sys.exit("{0}: error reading model file '{1}'".format(sys.argv[0], model_filename, repr(e))) + sys.exit("{0}: error reading model file '{1}'".format(sys.argv[0], + model_filename, + repr(e))) # use nnet3-info to get component names in the model. out = common_lib.get_command_stdout("""nnet3-info {0} | grep '\-node' """ @@ -85,12 +87,12 @@ def get_model_component_info(model_filename): # out contains all {input,component}-nodes used in model_filename # It can parse lines in out like: # i.e. input-node name=input dim=40 - # component-node name=tdnn1.affine component=tdnn1.affine input=lda input-dim=300 output-dim=512 + # component-node name=tdnn1.affine component=tdnn1.affine input=lda + # input-dim=300 output-dim=512 layer_names = [] for line in out.split("\n"): parts = line.split(" ") - input_dim = -1 - output_dim = -1 + dim = -1 for field in parts: key_value = field.split("=") if len(key_value) == 2: @@ -99,27 +101,17 @@ def get_model_component_info(model_filename): if key == "name": # name=** layer_name = value elif key == "dim": # for input-node - input_dim = int(value) - elif key == "input-dim": # for component-node - input_dim = int(value) + dim = int(value) elif key == "output-dim": # for component-node - output_dim = int(value) - elif key == "input": # for component-node i.e. input=lda - input_str = value + dim = int(value) if layer_name is not None and layer_name not in layer_names: key_to_value = dict() layer_names.append(layer_name) key_to_value['name'] = layer_name - assert(input_dim != -1) - if output_dim == -1: - # The layer type is input-node. 
- key_to_value['dim'] = input_dim - else: - # The layer type is component-node - assert(input_str is not None) - key_to_value['dim'] = output_dim - all_layers.append(xlayers.XconfigAuxiliaryLayer('auxiliary', key_to_value, all_layers)) + assert(dim != -1) + key_to_value['dim'] = dim + all_layers.append(xlayers.XconfigExistingLayer('existing', key_to_value, all_layers)) if len(all_layers) == 0: raise RuntimeError("{0}: model filename '{1}' is empty.".format( sys.argv[0], model_filename)) @@ -127,13 +119,15 @@ def get_model_component_info(model_filename): return all_layers -# This function reads an xconfig file and returns it as a list of layers +# This function reads xconfig file and returns it as a list of layers # (usually we use the variable name 'all_layers' elsewhere for this). # It will die if the xconfig file is empty or if there was # some error parsing it. -# aux_layers is a list of auxilary layers({component,input,output}-node) -# can be used as input to component-nodes used in xconfig_file. -def read_xconfig_file(xconfig_filename, aux_layers=[]): +# 'existing_layers' contains some 'existing' layers (layers which are not really +# layers but are actual component node names from an existing neural net model +# and created using get_model_component_info function.) +# that can be used as input to component-nodes in layers xconfig file. +def read_xconfig_file(xconfig_filename, existing_layers=[]): try: f = open(xconfig_filename, 'r') except Exception as e: @@ -146,11 +140,11 @@ def read_xconfig_file(xconfig_filename, aux_layers=[]): break # the next call will raise an easy-to-understand exception if # it fails. - this_layer = xconfig_line_to_object(line, aux_layers) + this_layer = xconfig_line_to_object(line, existing_layers) if this_layer is None: continue # line was blank after removing comments. all_layers.append(this_layer) - aux_layers.append(this_layer) + existing_layers.append(this_layer) if len(all_layers) == 0: raise RuntimeError("{0}: xconfig file '{1}' is empty".format( sys.argv[0], xconfig_filename)) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py index 0cd22975c26..dde19ab02c6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py @@ -14,7 +14,8 @@ # Given a list of objects of type XconfigLayerBase ('all_layers'), # including at least the layers preceding 'current_layer' (and maybe # more layers), return the names of layers preceding 'current_layer' -# other than layers of type 'auxiliary'. +# other than layers of type 'existing', which correspond to component-node +# names from an existing model that we are adding layers to them. # This will be used in parsing expressions like [-1] in descriptors # (which is an alias for the previous layer). 
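The 'existing' layer names come straight from the node list of the source model. A small illustration of the nnet3-info output that get_model_component_info() parses (the model path is illustrative; the node lines follow the examples in the comments above):

  nnet3-info ../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl | grep '\-node'
  # typical lines:
  #   input-node name=input dim=40
  #   component-node name=tdnn1.affine component=tdnn1.affine input=lda input-dim=300 output-dim=512
  # Each such node becomes an internal layer like
  #   existing name=tdnn1.affine dim=512
  # which later xconfig lines may use as an input, but which is never written
  # to any *.config file.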
def get_prev_names(all_layers, current_layer): @@ -22,7 +23,7 @@ def get_prev_names(all_layers, current_layer): for layer in all_layers: if layer is current_layer: break - if layer.layer_type is not 'auxiliary': + if layer.layer_type is not 'existing': prev_names.append(layer.get_name()) prev_names_set = set() for name in prev_names: @@ -35,7 +36,6 @@ def get_prev_names(all_layers, current_layer): # This is a convenience function to parser the auxiliary output name from the # full layer name - def split_layer_name(full_layer_name): assert isinstance(full_layer_name, str) split_name = full_layer_name.split('.') @@ -62,13 +62,26 @@ def get_dim_from_layer_name(all_layers, current_layer, full_layer_name): for layer in all_layers: if layer is current_layer: break - # if "." used in layer name like tdnn.1 + # If 'all_layers' contains some 'existing' layers, i.e. layers which + # are not really layers but are actual component names from an existing + # neural net that we are adding components to, they may already be + # of the form 'xxx.yyy', e.g. 'tdnn1.affine'. In this case the name of + # the layer in 'all_layers' won't be just the 'xxx' part (e.g. 'tdnn1'), + # it will be the full thing, like 'tdnn1.affine'. + # We will also use the if-statement immediately below this comment for + # regular layers, e.g. where full_layer_name is something like 'tdnn2'. + # The if-statement below the next one, that uses + # auxiliary_output, will only be used in the (rare) case when we are + # using auxiliary outputs, e.g. 'lstm1.c'. if layer.get_name() == full_layer_name: return layer.output_dim() if layer.get_name() == layer_name: - if not auxiliary_output in layer.auxiliary_outputs() and auxiliary_output is not None: - raise RuntimeError("Layer '{0}' has no such auxiliary output: '{1}' ({0}.{1})".format(layer_name, auxiliary_output)) + if (not auxiliary_output in layer.auxiliary_outputs() + and auxiliary_output is not None): + raise RuntimeError("Layer '{0}' has no such auxiliary output:" + "'{1}' ({0}.{1})".format(layer_name, + auxiliary_output)) return layer.output_dim(auxiliary_output) # No such layer was found. if layer_name in [ layer.get_name() for layer in all_layers ]: @@ -91,13 +104,23 @@ def get_string_from_layer_name(all_layers, current_layer, full_layer_name): for layer in all_layers: if layer is current_layer: break - # full_layer_name with "." + # The following if-statement is needed to handle the case where the + # layer is an 'existing' layer, derived from an existing trained + # neural network supplied via the --existing-model option, that we are + # adding layers to. In this case the name of the layer will actually + # be of the form xxx.yyy, e.g. 'tdnn1.affine'. + # The code path will also be taken for regular (non-'existing') layer + # names where the 'auxiliary_output' field is not used, which is actually + # the normal case (e.g. when 'full_layer_name' is 'lstm1', + # as opposed to, say, 'lstm1.c' if layer.get_name() == full_layer_name: return layer.output_name() if layer.get_name() == layer_name: - if not auxiliary_output in layer.auxiliary_outputs() and auxiliary_output is not None: - raise RuntimeError("Layer '{0}' has no such auxiliary output: '{1}' ({0}.{1})".format( + if (not auxiliary_output in layer.auxiliary_outputs() and + auxiliary_output is not None): + raise RuntimeError("Layer '{0}' has no such auxiliary output: " + "'{1}' ({0}.{1})".format( layer_name, auxiliary_output)) return layer.output_name(auxiliary_output) # No such layer was found. 
diff --git a/egs/wsj/s5/steps/nnet3/align_lats.sh b/egs/wsj/s5/steps/nnet3/align_lats.sh index f931b826b7b..35f171dc5e4 100755 --- a/egs/wsj/s5/steps/nnet3/align_lats.sh +++ b/egs/wsj/s5/steps/nnet3/align_lats.sh @@ -5,7 +5,7 @@ # 2016 Vimal Manohar # Apache 2.0 -# Computes training alignments using nnet3 DNN +# Computes training alignments using nnet3 DNN, with output to lattices. # Begin configuration section. nj=4 @@ -24,6 +24,7 @@ extra_left_context_initial=-1 extra_right_context_final=-1 online_ivector_dir= graphs_scp= +generate_ali_from_lats=false # If true, alingments generated from lattices. # End configuration options. echo "$0 $@" # Print the command line for logging @@ -166,4 +167,10 @@ if [ $stage -le 1 ]; then "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; fi -echo "$0: done aligning data." +if [ $stage -le 2 ] && $generate_ali_from_lats; then + # If generate_alignments is true, ali.*.gz is generated in lats dir + $cmd JOB=1:$nj $dir/log/generate_alignments.JOB.log \ + lattice-best-path --acoustic-scale=$acoustic_scale "ark:gunzip -c $dir/lat.JOB.gz |" \ + ark:/dev/null "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi +echo "$0: done generating lattices from training transcripts." diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index bbac3807e9d..47e7c73a9f4 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -16,11 +16,11 @@ # limitations under the License. # This script creates denominator FST (den.fst) and normalization.fst for -# chain training. It additional copies the transition model and tree from the +# chain training. It additionally copies the transition model and tree from the # first alignment directory to the chain directory. # This script can accept multiple sources of alignments with same phone sets # that can be weighted to estimate phone LM. -# Each alignment directory should contain tree, final,mdl and ali.*.gz. +# Each alignment directory should contain tree, final.mdl and ali.*.gz. set -o pipefail @@ -29,6 +29,10 @@ cmd=run.pl stage=-10 weights= # comma-separated list of integer valued scale weights used # to scale different phone sequences for different alignments. + # Scaling the count with i^th int weight 'w' is done by repeating + # the i^th phone sequence 'w' times. + # If not specified, weight '1' is used for all phone sequences. + lm_opts='num_extra_lm_state=2000' #end configuration section. @@ -65,20 +69,33 @@ done cp ${ali_dirs[0]}/tree $dir/ || exit 1 +if [ -z $weights ]; then + # If 'weights' is not specified, comma-separated array '1' with dim + #'num_alignments' is specified as 'weights'. + for n in `seq 1 $num_alignments`;do weights="$weights,1"; done +fi + if [ $stage -le 1 ]; then for n in `seq 0 $[num_alignments-1]`; do w=$(echo $weights | cut -d, -f$[$n+1]) + adir=${ali_dirs[$n]} + num_jobs=$(cat $adir/num_jobs) if ! 
[[ $w =~ ^[+]?[0-9]+$ ]] ; then echo "no positive int weight specified for alignment ${ali_dirs[$n]}" && exit 1; fi + rm $adir/alignment_files.txt 2>/dev/null || true + for x in `seq $w`;do + for j in `seq $num_jobs`;do + echo $adir/ali.$j.gz >> $adir/alignment_files.txt + done + done done $cmd $dir/log/make_phone_lm_fst.log \ ali_dirs=\(${ali_dirs[@]}\) \; \ for n in `seq 0 $[num_alignments-1]`\; do \ adir=\${ali_dirs[\$n]} \; \ - w=\$\(echo $weights \| cut -d, -f\$[\$n+1]\) \; \ - for x in \$\(seq \$w\)\; do gunzip -c \$adir/ali.*.gz \; done \| \ - ali-to-phones \$adir/final.mdl ark:- ark:- \; \ + cat \$adir/alignment_files.txt \| while read f\; do gunzip -c \$f \; done \| \ + ali-to-phones \$adir/final.mdl ark:- ark:- \; \ done \| \ chain-est-phone-lm $lm_opts ark:- $dir/phone_lm.fst || exit 1; fi diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index ec70a3f4408..06fa12092f4 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -104,10 +104,11 @@ def get_args(): parser.add_argument("--trainer.input-model", type=str, dest='input_model', default=None, action=common_lib.NullstrToNoneAction, - help="If specified, this model is used as 0.raw model " - "and no LDA matrix or init.raw initialzed." + help="If specified, this model is used as the initial " + "'raw' model (0.raw in the script) instead of " + "initializing the model from the xconfig. " "Also configs dir is not expected to exist " - "and context is generated using this model.") + "and left/right context is computed from this model.") parser.add_argument("--trainer.num-epochs", type=float, dest='num_epochs', default=10.0, help="Number of epochs to train the model") @@ -217,11 +218,12 @@ def process_args(args): if (not os.path.exists(args.dir) or (not os.path.exists(args.dir+"/configs") and not os.path.exists(args.input_model))): - raise Exception("This scripts expects {0} to exist. Also either of --trainer.input-model " - " as '0.raw' model should exist or {0} should have a configs " - "directory which is the output of " - "make_configs.py script.".format( - args.dir, args.input_model)) + raise Exception("This script expects {0} to exist. Also either " + "--trainer.input-model option as initial 'raw' model " + "(used as 0.raw in the script) should be supplied or " + "{0}/configs directory which is the output of " + "make_configs.py script should be provided.".format( + args.dir)) if args.transform_dir is None: args.transform_dir = args.lat_dir @@ -290,9 +292,9 @@ def train(args, run_opts): variables = common_train_lib.parse_generic_config_vars_file(var_file) else: - # if args.input_model specified, the model left and right context - # computed using input_model. - variables = common_train_lib.parse_input_model(args.input_model) + # If args.input_model is specified, the model left and right contexts + # are computed using input_model. + variables = common_train_lib.get_input_model_info(args.input_model) # Set some variables. 
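When the configs directory is absent, the left/right contexts that would normally come from configs/vars are read off the input model itself. A hedged sketch of the equivalent check from the shell (the context labels are as printed by nnet3-info; the numbers are illustrative):

  nnet3-info exp/chain/tdnn_wsj_rm_1b/input.raw | grep -E '^(left|right)-context'
  # e.g.
  #   left-context: 16
  #   right-context: 12
  # get_input_model_info() returns these as model_left_context and
  # model_right_context, the same keys train.py would otherwise read from
  # configs/vars.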
try: @@ -398,7 +400,8 @@ def train(args, run_opts): logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir)) common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) - if (args.stage <= -2) and os.path.exists(args.dir+"/configs/init.config") and args.input_model is None: + if ((args.stage <= -2) and (os.path.exists(args.dir+"/configs/init.config")) + and (args.input_model is None)): logger.info('Computing the preconditioning matrix for input features') chain_lib.compute_preconditioning_matrix( @@ -408,7 +411,8 @@ def train(args, run_opts): if (args.stage <= -1): logger.info("Preparing the initial acoustic model.") - chain_lib.prepare_initial_acoustic_model(args.dir, run_opts, input_mdl=args.input_model) + chain_lib.prepare_initial_acoustic_model(args.dir, run_opts, + input_model=args.input_model) with open("{0}/frame_subsampling_factor".format(args.dir), "w") as f: f.write(str(args.frame_subsampling_factor)) diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index 3c81ec12b95..a9325079450 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -314,10 +314,10 @@ def check_model_contexts(config_dir, nnet_edits=None, existing_model=None): def main(): args = get_args() backup_xconfig_file(args.xconfig_file, args.config_dir) - aux_layers = [] + existing_layers = [] if args.existing_model is not None: - aux_layers = xparser.get_model_component_info(args.existing_model) - all_layers = xparser.read_xconfig_file(args.xconfig_file, aux_layers) + existing_layers = xparser.get_model_component_info(args.existing_model) + all_layers = xparser.read_xconfig_file(args.xconfig_file, existing_layers) write_expanded_xconfig_files(args.config_dir, all_layers) write_config_files(args.config_dir, all_layers) check_model_contexts(args.config_dir, args.nnet_edits, From eb009839fc344e1b3b4a6152f5bafee47b61843a Mon Sep 17 00:00:00 2001 From: Pegita Date: Wed, 23 Aug 2017 20:13:23 -0400 Subject: [PATCH 047/174] small fix. --- egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index fe3d8f267dd..6f8a37fb265 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -142,7 +142,7 @@ if [ $stage -le 4 ]; then # use the same num-jobs as the alignments nj=$(cat exp/tri3b_ali/num_jobs) || exit 1; steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \ - --generate-ali-from-lats true --stage 4 \ + --generate-ali-from-lats true \ data/train $lang_src_tgt $src_gmm_dir $lat_dir || exit 1; rm $lat_dir/fsts.*.gz 2>/dev/null || true # save space fi From ef7275be87d438e147a5753a182e3e23529be6aa Mon Sep 17 00:00:00 2001 From: Pegita Date: Thu, 24 Aug 2017 15:26:04 -0400 Subject: [PATCH 048/174] fixed old comments and added new comments. 
--- egs/rm/s5/RESULTS | 2 +- .../local/chain/tuning/run_tdnn_wsj_rm_1a.sh | 60 +++++++++++-------- .../local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 44 +++++++------- .../local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 35 ++++++----- egs/rm/s5/local/online/run_nnet2_common.sh | 3 +- egs/rm/s5/local/prepare_wsj_rm_lang.sh | 17 +++--- .../nnet3/train/chain_objf/acoustic_model.py | 4 +- egs/wsj/s5/steps/libs/nnet3/train/common.py | 7 ++- .../nnet3/train/frame_level_objf/common.py | 1 + .../steps/libs/nnet3/xconfig/basic_layers.py | 15 ++++- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 13 +++- egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py | 2 + egs/wsj/s5/steps/nnet3/align_lats.sh | 1 + .../nnet3/chain/make_weighted_den_fst.sh | 14 ++++- egs/wsj/s5/steps/nnet3/chain/train.py | 17 +++--- 15 files changed, 143 insertions(+), 92 deletions(-) diff --git a/egs/rm/s5/RESULTS b/egs/rm/s5/RESULTS index 368abd2751f..a8156e10e14 100644 --- a/egs/rm/s5/RESULTS +++ b/egs/rm/s5/RESULTS @@ -235,7 +235,7 @@ for x in exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_*; do grep WER $x/ %WER 2.71 [ 340 / 12533, 58 ins, 59 del, 223 sub ] exp/chain/tdnn_5n/decode/wer_4_0.0 ### WSJ->RM Transfer learning using chain model ### -%WER 2.21 [ 277 / 12533, 42 ins, 45 del, 190 sub ] exp/chain/tdnn_wsj_rm/decode/wer_3_0.5 +%WER 1.68 [ 210 / 12533, 25 ins, 33 del, 152 sub ] exp/chain/tdnn_wsj_rm_1a/decode/wer_2_0.0 ### nnet1 results ### diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh index b6852f0f812..2ede5364fed 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -1,10 +1,14 @@ #!/bin/bash -# This script uses weight transfer as a Transfer learning method to transfer -# model trained on wsj to rm dataset. -# It uses already trained model on wsj and removes its last layer and -# adds new randomly initialized layer and retrains the whole network with -# smaller learning-rate, while training new added layer using rm data. +# This script uses weight transfer as a transfer learning method to transfer +# already trained neural net model on wsj to rm. +# +# Model preparation: The last layer (prefinal and output layer) from +# already-trained wsj model is removed and 3 randomly initialized layer +# (new tdnn layer, prefinal, and output) are added to the model. +# +# Training: The transferred layers are retrained with smaller learning-rate, +# while new added layers are trained with larger learning rate using rm data. # The chain config is as in run_tdnn_5n.sh and the result is: #System tdnn_5n tdnn_wsj_rm_1a #WER 2.71 1.68 @@ -18,20 +22,21 @@ dir=exp/chain/tdnn_wsj_rm_1a xent_regularize=0.1 # configs for transfer learning -src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl # input chain model - # trained on source dataset (wsj). - # This model is transfered to the target domain. +src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl # Input chain model + # trained on source dataset (wsj). + # This model is transfered to the target domain. src_mfcc_config=../../wsj/s5/conf/mfcc_hires.conf # mfcc config used to extract higher dim - # mfcc features for ivector training - # in source domain. -src_ivec_extractor_dir= # source ivector extractor dir used to extract ivector for - # source data and the ivector for target data is extracted using this extractor. - # It should be nonempty, if ivector is used in source model training. + # mfcc features for ivector and DNN training + # in the source domain. 
+src_ivec_extractor_dir= # Source ivector extractor dir used to extract ivector for + # source data. The ivector for target data is extracted using this extractor. + # It should be nonempty, if ivector is used in the source model training. common_egs_dir= primary_lr_factor=0.25 # The learning-rate factor for transferred layers from source - # model. e.g. if 0, it fixed the paramters transferred from source. + # model. e.g. if 0, the paramters transferred from source model + # are fixed. # The learning-rate factor for new added layers is 1.0. nnet_affix=_online_wsj @@ -50,6 +55,7 @@ If you want to use GPUs (and have them), go to src/, and configure and make on a where "nvcc" is installed. EOF fi + required_files="$src_mfcc_config $src_mdl" use_ivector=false ivector_dim=$(nnet3-am-info --print-args=false $src_mdl | grep "ivector-dim" | cut -d" " -f2) @@ -57,20 +63,22 @@ if [ "$ivector_dim" == "" ]; then ivector_dim=0 ; fi if [ ! -z $src_ivec_extractor_dir ]; then if [ $ivector_dim -eq 0 ]; then - echo "source ivector extractor dir '$src_ivec_extractor_dir' is specified but ivector is not used in training the source model '$src_mdl'." + echo "$0: Source ivector extractor dir '$src_ivec_extractor_dir' is specified " + echo "but ivector is not used in training the source model '$src_mdl'." else required_files="$required_files $src_ivec_extractor_dir/final.dubm $src_ivec_extractor_dir/final.mat $src_ivec_extractor_dir/final.ie" use_ivector=true fi else if [ $ivector_dim -gt 0 ]; then - echo "ivector is used in training the source model '$src_mdl' but no ivector extractor dir for source model specified." && exit 1; + echo "$0: ivector is used in training the source model '$src_mdl' but no " + echo "ivector extractor dir for source model is specified." && exit 1; fi fi for f in $required_files; do if [ ! -f $f ]; then - echo "$0: no such file $f" + echo "$0: no such file $f." fi done @@ -91,10 +99,10 @@ local/online/run_nnet2_common.sh --stage $stage \ if [ $stage -le 4 ]; then # Get the alignments as lattices (gives the chain training more freedom). # use the same num-jobs as the alignments - nj=$(cat exp/tri3b_ali/num_jobs) || exit 1; + nj=$(cat $ali_dir/num_jobs) || exit 1; steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/train \ - data/lang exp/tri3b exp/tri3b_lats - rm exp/tri3b_lats/fsts.*.gz # save space + data/lang exp/tri3b exp/tri3b_lats || exit 1; + rm exp/tri3b_lats/fsts.*.gz 2>/dev/null || true # save space fi if [ $stage -le 5 ]; then @@ -114,7 +122,7 @@ if [ $stage -le 6 ]; then # Build a tree using our new topology. 
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir + --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir || exit 1; fi if [ $stage -le 7 ]; then @@ -125,19 +133,19 @@ if [ $stage -le 7 ]; then mkdir -p $dir mkdir -p $dir/configs cat < $dir/configs/network.xconfig - relu-renorm-layer name=tdnn7-target input=Append(tdnn6.renorm@-3,tdnn6.renorm@0) dim=450 + relu-renorm-layer name=tdnn-target input=Append(tdnn6.renorm@-3,tdnn6.renorm) dim=450 ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain input=tdnn7-target dim=450 target-rms=0.5 + relu-renorm-layer name=prefinal-chain input=tdnn-target dim=450 target-rms=0.5 output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - relu-renorm-layer name=prefinal-xent input=tdnn7-target dim=450 target-rms=0.5 + relu-renorm-layer name=prefinal-xent input=tdnn-target dim=450 target-rms=0.5 output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \ --xconfig-file $dir/configs/network.xconfig \ --config-dir $dir/configs/ - # Set the learning-rate-factor to be primary_lr_factor for initial network." - # and add new layer to initial model + # Set the learning-rate-factor to be primary_lr_factor for transferred layers " + # and adding new layers to them. $train_cmd $dir/log/generate_input_mdl.log \ nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" $src_mdl - \| \ nnet3-init --srand=1 - $dir/configs/final.config $dir/input.raw || exit 1; diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index 6f8a37fb265..40e026b0552 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -1,25 +1,26 @@ #!/bin/bash # _1b is as _1a, but different as follows -# 1) uses wsj phone set phones.txt and new lexicon generated using word pronunciation -# in swj lexincon.txt and rm words not presented in wsj are added as oov +# 1) It uses wsj phone set phones.txt and new lexicon generated using word pronunciation +# in swj lexincon.txt. rm words, that are not presented in wsj, are added as oov # in new lexicon.txt. -# 2) It uses wsj tree-dir and generates new rm alignments and lattices using +# 2) It uses wsj tree-dir and generates new alignments and lattices for rm using # wsj gmm model. -# 3) It also train phone LM using weighted combination of alignemts from wsj +# 3) It also trains phone LM using weighted combination of alignemts from wsj # and rm, which is used in chain denominator graph. # Since we use phone.txt from source dataset, this can be helpful in cases -# where there is few training data in target and some 4-gram phone sequences -# have no count in target. -# 4) It does not replace the output layer from already-trained model with new -# randomely initialized output layer and and re-train it using target dataset. +# where there is few training data in the target domain and some 4-gram phone +# sequences have no count in the target domain. +# 4) It uses whole already-trained model and does not replace the output layer +# from already-trained model with new randomely initialized output layer and +# re-train it using target dataset. 
-# This script uses weight transfer as Transfer learning method +# This script uses weight transfer as a transfer learning method # and use already trained model on wsj and fine-tune the whole network using rm data -# while training the last layer with higher learning-rate. +# while training the last layer (output layer) with higher learning-rate. # The chain config is as run_tdnn_5n.sh and the result is: # System tdnn_5n tdnn_wsj_rm_1a tdnn_wsj_rm_1b tdnn_wsj_rm_1c -# WER 2.71 1.68 3.45 3.38 +# WER 2.71 1.68 3.56 3.54 set -e # configs for 'chain' @@ -34,23 +35,23 @@ primary_lr_factor=0.25 # The learning-rate factor for transferred layers from so # model. e.g. if 0, it fixed the paramters transferred from source. # The learning-rate factor for new added layers is 1.0. nnet_affix=_online_wsj -phone_lm_scales="1,10" # comma-separated list of integer valued scale weights +phone_lm_scales="1,10" # comma-separated list of positive int valued scale weights # to scale different phone sequences for different alignments # e.g. (src-weight,target-weight)=(1,10) # model and dirs for source model used for transfer learning -src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl # input chain model +src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl # Input chain model # trained on source dataset (wsj). # This model is transfered to the target domain. src_mfcc_config=../../wsj/s5/conf/mfcc_hires.conf # mfcc config used to extract higher dim - # mfcc features for ivector training - # in source domain. -src_ivec_extractor_dir= # source ivector extractor dir used to extract ivector for + # mfcc features for ivector and DNN training + # in the source domain. +src_ivec_extractor_dir= # Source ivector extractor dir used to extract ivector for # source data and the ivector for target data is extracted using this extractor. # It should be nonempty, if ivector is used in source model training. -src_lang=../../wsj/s5/data/lang # source lang directory used to train source model. +src_lang=../../wsj/s5/data/lang # Source lang directory used to train source model. # new lang dir for transfer learning experiment is prepared # using source phone set phones.txt and lexicon.txt # in src lang and dict dirs and words.txt in target lang dir. @@ -103,14 +104,16 @@ if [ "$ivector_dim" == "" ]; then ivector_dim=0 ; fi if [ ! -z $src_ivec_extractor_dir ]; then if [ $ivector_dim -eq 0 ]; then - echo "source ivector extractor dir '$src_ivec_extractor_dir' is specified but ivector is not used in training the source model '$src_mdl'." + echo "$0: Source ivector extractor dir '$src_ivec_extractor_dir' is specified " + echo "but ivector is not used in training the source model '$src_mdl'." else required_files="$required_files $src_ivec_extractor_dir/final.dubm $src_ivec_extractor_dir/final.mat $src_ivec_extractor_dir/final.ie" use_ivector=true fi else if [ $ivector_dim -gt 0 ]; then - echo "ivector is used in training the source model '$src_mdl' but no ivector extractor dir for source model is specified." && exit 1; + echo "$0: ivector is used in training the source model '$src_mdl' but no " + echo " ivector extractor dir for source model is specified." && exit 1; fi fi @@ -148,7 +151,8 @@ if [ $stage -le 4 ]; then fi if [ $stage -le 5 ]; then - # set the learning-rate-factor for initial network to be primary_lr_factor." + # Set the learning-rate-factor for all transferred layers but the last output + # layer to primary_lr_factor. 
$train_cmd $dir/log/generate_input_mdl.log \ nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ $src_mdl $dir/input.raw || exit 1; diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index 0c70d0970ea..34b91e2fd13 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -3,26 +3,26 @@ # to generate alignments for RM using SWJ model. # _1b is as _1a, but different as follows -# 1) uses src phone set phones.txt and new lexicon generated using word pronunciation -# in src lexincon.txt and target word not presented in src are added as oov -# in lexicon.txt. -# 2) It uses src tree-dir and generates new target alignment and lattices using -# src gmm model. -# 3) It also train phone LM using weighted combination of alignemts from source -# and target, which is used in chain denominator graph. +# 1) It uses wsj phone set phones.txt and new lexicon generated using word pronunciation +# in swj lexincon.txt. rm words, that are not presented in wsj, are added as oov +# in new lexicon.txt. +# 2) It uses wsj tree-dir and generates new alignments and lattices for rm using +# wsj gmm model. +# 3) It also trains phone LM using weighted combination of alignemts from wsj +# and rm, which is used in chain denominator graph. # Since we use phone.txt from source dataset, this can be helpful in cases -# where there is few training data in target and some 4-gram phone sequences -# have no count in target. -# 4) It does not replace the output layer from already-trained model with new -# randomely initialized output layer and and re-train it using target dataset. - +# where there is few training data in the target domain and some 4-gram phone +# sequences have no count in the target domain. +# 4) It uses whole already-trained model and does not replace the output layer +# from already-trained model with new randomely initialized output layer and +# re-train it using target dataset. # This script uses weight transfer as Transfer learning method # and use already trained model on wsj and fine-tune the whole network using # rm data while training the last layer with higher learning-rate. # The chain config is as run_tdnn_5n.sh and the result is: # System tdnn_5n tdnn_wsj_rm_1a tdnn_wsj_rm_1b tdnn_wsj_rm_1c -# WER 2.71 1.68 3.45 3.38 +# WER 2.71 1.68 3.56 3.54 set -e @@ -101,14 +101,16 @@ if [ "$ivector_dim" == "" ]; then ivector_dim=0 ; fi if [ ! -z $src_ivec_extractor_dir ]; then if [ $ivector_dim -eq 0 ]; then - echo "source ivector extractor dir '$src_ivec_extractor_dir' is specified but ivector is not used in training the source model '$src_mdl'." + echo "$0: Source ivector extractor dir '$src_ivec_extractor_dir' is " + echo "specified but ivector is not used in training the source model '$src_mdl'." else required_files="$required_files $src_ivec_extractor_dir/final.dubm $src_ivec_extractor_dir/final.mat $src_ivec_extractor_dir/final.ie" use_ivector=true fi else if [ $ivector_dim -gt 0 ]; then - echo "ivector is used in training the source model '$src_mdl' but no ivector extractor dir for source model specified." && exit 1; + echo "$0: ivector is used in training the source model '$src_mdl' but no " + echo " ivector extractor dir for source model is specified." 
&& exit 1; fi fi @@ -151,7 +153,8 @@ if [ $stage -le 4 ]; then fi if [ $stage -le 5 ]; then - # set the learning-rate-factor for initial network to be primary_lr_factor." + # Set the learning-rate-factor for all transferred layers but the last output + # layer to primary_lr_factor. $train_cmd $dir/log/generate_input_mdl.log \ nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ $src_mdl $dir/input.raw || exit 1; diff --git a/egs/rm/s5/local/online/run_nnet2_common.sh b/egs/rm/s5/local/online/run_nnet2_common.sh index 58ac72d374b..86346339f62 100755 --- a/egs/rm/s5/local/online/run_nnet2_common.sh +++ b/egs/rm/s5/local/online/run_nnet2_common.sh @@ -1,5 +1,6 @@ #!/bin/bash - +# This script extracts mfcc features using mfcc_config and train ubm model and +# ivector extractor and extract ivector for train and test. . cmd.sh diff --git a/egs/rm/s5/local/prepare_wsj_rm_lang.sh b/egs/rm/s5/local/prepare_wsj_rm_lang.sh index 8eeb45a90d6..fd8cb958925 100755 --- a/egs/rm/s5/local/prepare_wsj_rm_lang.sh +++ b/egs/rm/s5/local/prepare_wsj_rm_lang.sh @@ -1,13 +1,14 @@ #!/bin/bash # Copyright 2017 Pegah Ghahremani -# This script prepares a dictionary for wsj to rm transfer learning experiment -# which uses wsj phone set, lexicon and dict and -# the lexicon for rm words.txt are copied from wsj lexicon for common words in wsj -# and rm. words in rm that are not in the wsj lexicon are added -# as oov in lexicon.txt. -# The oov word "" in wsj is also added to words.txt and G.fst is recompiled using -# updated word list. +# This script prepares a dictionary for wsj-to-rm transfer learning experiment, +# which uses wsj phone set phones.txt, lexicon lexicon.txt and dict. +# The new lexicon.txt are created for words in rm words.txt as follows: +# 1) The lexicon are copied from wsj lexicon.txt for common words in wsj and rm. +# 2) Words in rm that are not in the wsj lexicon are added +# as oov to new lexicon.txt. +# The oov word "" in wsj is also added to words.txt and G.fst is +# recompiled using updated word list. if [ -f path.sh ]; then . ./path.sh; fi . utils/parse_options.sh @@ -25,7 +26,7 @@ output_dir=$3 required_dict_files="$src_dict/lexicon.txt $src_dict/nonsilence_phones.txt $src_dict/silence_phones.txt $src_dict/optional_silence.txt $src_lang/oov.txt $src_lang/phones.txt" for f in $required_dict_files; do if [ ! -f $f ]; then - echo "file $f that is required for preparing lang does not exists." && exit 1; + echo "$0: file $f that is required for preparing lang does not exist." && exit 1; fi done diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index b44170149e8..c8dc64b8481 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -430,8 +430,8 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_model=None): """ This function adds the first layer; It will also prepare the acoustic model with the transition model. If 'input_model' is specified, no initial network preparation(adding - first layer) is done and this model is used initial 'raw' model - instead of '0.raw' model to prepare '0.mdl' acoustic model by adding the + the first layer) is done and this model is used as initial 'raw' model + instead of '0.raw' model to prepare '0.mdl' as acoustic model by adding the transition model. 
""" if input_model is None: diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 273d21e5c94..75065875c91 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -358,10 +358,11 @@ def parse_generic_config_vars_file(var_file): def get_input_model_info(input_model): - """ This function returns a dictionary with keys "model_{left/right}_context" - and values equal to the left/right model contexts for input_model. + """ This function returns a dictionary with keys "model_left_context" and + "model_right_context" and values equal to the left/right model contexts + for input_model. This function is useful when using the --trainer.input-model option - instead of initializing model using configs. + instead of initializing the model using configs. """ variables = {} try: diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 6c41ee17eda..4d142ba3266 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -99,6 +99,7 @@ def train_new_models(dir, iter, srand, num_jobs, else: image_augmentation_cmd = '' + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( egs_dir, egs_prefix="egs.", diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 43c685c01cc..ff89687ac16 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -41,6 +41,17 @@ def __init__(self, first_token, key_to_value, all_layers): if not xutils.is_valid_line_name(self.name): raise RuntimeError("Invalid value: name={0}".format( key_to_value['name'])) + + # It is possible to have two layers with same name if one of them + # is 'existing' layer type. + # Layers of type 'existing' correspond to component-node names + # from existing model that we are adding layers to them. + # These layers are not presented in any config file and new layer + # with same name as these layers can exist in all_layers. + # i.e. output-node with name 'output' in the existing model is added to + # all_layers using layer type 'existing' and it is possible to have + # 'output-node' of type 'output-layer' with same name 'output' in + # all_layers. for prev_layer in all_layers: if (self.name == prev_layer.name and prev_layer.layer_type is not 'existing'): @@ -1083,7 +1094,7 @@ class XconfigExistingLayer(XconfigLayerBase): config files. Layers of this type are created internally for all component nodes in an existing neural net model for use as input to other layers. - (i.e. get_model_component_info, which is called in + (i.e. 
get_model_component_info function, which is called in steps/nnet3/xconfig_to_configs.py, returns a list of 'existing' layers for component nodes used in 'existing_model') This class is useful in cases like transferring existing model @@ -1119,7 +1130,7 @@ def output_dim(self, auxiliary_outputs=None): return self.config['dim'] def get_full_config(self): - # unlike other layers the auxiliary layers should not to be printed in + # unlike other layers the existing layers should not to be printed in # any '*.config' ans = [] return ans diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 63f057148dd..db363c3cd0f 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -11,6 +11,7 @@ import sys import libs.nnet3.xconfig.layers as xlayers import libs.nnet3.xconfig.utils as xutils + import libs.common as common_lib @@ -67,10 +68,16 @@ def xconfig_line_to_object(config_line, prev_layers = None): "*** {0}".format(config_line)) raise + # This function reads existing model (*.raw or *.mdl) and returns array of # XconfigExistingLayer one per {input,output}-node or component-node with same -# 'name' used in raw model and 'dim' equal to 'output-dim' for component-node +# 'name' used in the raw model and 'dim' equal to 'output-dim' for component-node # and 'dim' for {input,output}-node. +# i.e. layer in *.mdl -> corresponding 'XconfigExistingLayer' layer +# 'input-node name=ivector dim=100' -> +# 'existing name=ivector dim=100' +# 'component-node name=tdnn1.affine ** input-node=1000 output-node=500' -> +# 'existing name=tdnn1.affine dim=500' def get_model_component_info(model_filename): all_layers = [] try: @@ -84,12 +91,13 @@ def get_model_component_info(model_filename): out = common_lib.get_command_stdout("""nnet3-info {0} | grep '\-node' """ """ """.format(model_filename)) - # out contains all {input,component}-nodes used in model_filename + # out contains all {output, input, component}-nodes used in model_filename # It can parse lines in out like: # i.e. input-node name=input dim=40 # component-node name=tdnn1.affine component=tdnn1.affine input=lda # input-dim=300 output-dim=512 layer_names = [] + key_to_value = dict() for line in out.split("\n"): parts = line.split(" ") dim = -1 @@ -106,7 +114,6 @@ def get_model_component_info(model_filename): dim = int(value) if layer_name is not None and layer_name not in layer_names: - key_to_value = dict() layer_names.append(layer_name) key_to_value['name'] = layer_name assert(dim != -1) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py index dde19ab02c6..6b5cc97eaf9 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py @@ -62,6 +62,7 @@ def get_dim_from_layer_name(all_layers, current_layer, full_layer_name): for layer in all_layers: if layer is current_layer: break + # If 'all_layers' contains some 'existing' layers, i.e. 
layers which # are not really layers but are actual component names from an existing # neural net that we are adding components to, they may already be @@ -104,6 +105,7 @@ def get_string_from_layer_name(all_layers, current_layer, full_layer_name): for layer in all_layers: if layer is current_layer: break + # The following if-statement is needed to handle the case where the # layer is an 'existing' layer, derived from an existing trained # neural network supplied via the --existing-model option, that we are diff --git a/egs/wsj/s5/steps/nnet3/align_lats.sh b/egs/wsj/s5/steps/nnet3/align_lats.sh index 35f171dc5e4..2f9042467ff 100755 --- a/egs/wsj/s5/steps/nnet3/align_lats.sh +++ b/egs/wsj/s5/steps/nnet3/align_lats.sh @@ -3,6 +3,7 @@ # 2013 Johns Hopkins University (Author: Daniel Povey) # 2015 Vijayaditya Peddinti # 2016 Vimal Manohar +# 2017 Pegah Ghahremani # Apache 2.0 # Computes training alignments using nnet3 DNN, with output to lattices. diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index 47e7c73a9f4..5e48acb914b 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -20,6 +20,8 @@ # first alignment directory to the chain directory. # This script can accept multiple sources of alignments with same phone sets # that can be weighted to estimate phone LM. +# 'weights' is comma-separated list of positive int values used +# to scale different phone sequences for different alignments. # Each alignment directory should contain tree, final.mdl and ali.*.gz. set -o pipefail @@ -27,10 +29,11 @@ set -o pipefail # begin configuration section. cmd=run.pl stage=-10 -weights= # comma-separated list of integer valued scale weights used +weights= # comma-separated list of positive int valued scale weights used # to scale different phone sequences for different alignments. # Scaling the count with i^th int weight 'w' is done by repeating # the i^th phone sequence 'w' times. + # i.e. "1,10" # If not specified, weight '1' is used for all phone sequences. lm_opts='num_extra_lm_state=2000' @@ -71,8 +74,15 @@ cp ${ali_dirs[0]}/tree $dir/ || exit 1 if [ -z $weights ]; then # If 'weights' is not specified, comma-separated array '1' with dim - #'num_alignments' is specified as 'weights'. + #'num_alignments' is defined as 'weights'. for n in `seq 1 $num_alignments`;do weights="$weights,1"; done +else + w_arr=(${weights//,/ }) + num_weights=${#w_arr[@]} + if [ $num_alignments -ne $num_weights ]; then + echo "$0: number of weights in $weight, $num_weights, should be equal to the " + echo "number of alignment directories, $num_alignments." && exit 1; + fi fi if [ $stage -le 1 ]; then diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 06fa12092f4..00624b09c69 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -104,7 +104,7 @@ def get_args(): parser.add_argument("--trainer.input-model", type=str, dest='input_model', default=None, action=common_lib.NullstrToNoneAction, - help="If specified, this model is used as the initial " + help="If specified, this model is used as initial " "'raw' model (0.raw in the script) instead of " "initializing the model from the xconfig. 
" "Also configs dir is not expected to exist " @@ -325,15 +325,16 @@ def train(args, run_opts): shutil.copy('{0}/tree'.format(args.tree_dir), args.dir) chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts) - if ((args.stage <= -4) and (os.path.exists("{0}/configs/init.config".format(args.dir))) - and (args.input_model is None)): + if ((args.stage <= -4) and + (os.path.exists("{0}/configs/init.config".format(args.dir))) + and (args.input_model is None)): logger.info("Initializing a basic network for estimating " "preconditioning matrix") common_lib.execute_command( """{command} {dir}/log/nnet_init.log \ - nnet3-init --srand=-2 {dir}/configs/init.config \ - {dir}/init.raw""".format(command=run_opts.command, - dir=args.dir)) + nnet3-init --srand=-2 {dir}/configs/init.config \ + {dir}/init.raw""".format(command=run_opts.command, + dir=args.dir)) egs_left_context = left_context + args.frame_subsampling_factor / 2 egs_right_context = right_context + args.frame_subsampling_factor / 2 @@ -346,7 +347,7 @@ def train(args, run_opts): right_context_final >= 0 else -1) default_egs_dir = '{0}/egs'.format(args.dir) - if (args.stage <= -3) and args.egs_dir is None: + if ((args.stage <= -3) and args.egs_dir is None): logger.info("Generating egs") if (not os.path.exists("{0}/den.fst".format(args.dir)) or not os.path.exists("{0}/normalization.fst".format(args.dir)) or @@ -401,7 +402,7 @@ def train(args, run_opts): common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) if ((args.stage <= -2) and (os.path.exists(args.dir+"/configs/init.config")) - and (args.input_model is None)): + and (args.input_model is None)): logger.info('Computing the preconditioning matrix for input features') chain_lib.compute_preconditioning_matrix( From 82fa510b56ba087e8aa7f29c522e44291ed51c0f Mon Sep 17 00:00:00 2001 From: Pegita Date: Thu, 24 Aug 2017 16:13:19 -0400 Subject: [PATCH 049/174] fixed some issues in python codes using pylint package. --- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 22 +++++---- egs/wsj/s5/steps/nnet3/chain/train.py | 47 ++++++++++--------- egs/wsj/s5/steps/nnet3/xconfig_to_configs.py | 30 ++++++------ 3 files changed, 54 insertions(+), 45 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index db363c3cd0f..636f26a1076 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -69,16 +69,20 @@ def xconfig_line_to_object(config_line, prev_layers = None): raise -# This function reads existing model (*.raw or *.mdl) and returns array of -# XconfigExistingLayer one per {input,output}-node or component-node with same -# 'name' used in the raw model and 'dim' equal to 'output-dim' for component-node -# and 'dim' for {input,output}-node. -# i.e. layer in *.mdl -> corresponding 'XconfigExistingLayer' layer -# 'input-node name=ivector dim=100' -> -# 'existing name=ivector dim=100' -# 'component-node name=tdnn1.affine ** input-node=1000 output-node=500' -> -# 'existing name=tdnn1.affine dim=500' def get_model_component_info(model_filename): + """ This function reads existing model (*.raw or *.mdl) and returns array + of XconfigExistingLayer one per {input,output}-node or component-node + with same 'name' used in the raw model and 'dim' equal to 'output-dim' + for component-node and 'dim' for {input,output}-node. + + i.e. 
layer in *.mdl -> corresponding 'XconfigExistingLayer' layer + 'input-node name=ivector dim=100' -> + 'existing name=ivector dim=100' + 'component-node name=tdnn1.affine ** input-node=1000 ' + 'output-node=500' -> + 'existing name=tdnn1.affine dim=500' + """ + all_layers = [] try: f = open(model_filename, 'r') diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 00624b09c69..25a5c4dbc28 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -108,7 +108,8 @@ def get_args(): "'raw' model (0.raw in the script) instead of " "initializing the model from the xconfig. " "Also configs dir is not expected to exist " - "and left/right context is computed from this model.") + "and left/right context is computed from this " + "model.") parser.add_argument("--trainer.num-epochs", type=float, dest='num_epochs', default=10.0, help="Number of epochs to train the model") @@ -195,10 +196,10 @@ def process_args(args): """ if not common_train_lib.validate_chunk_width(args.chunk_width): - raise Exception("--egs.chunk-width has an invalid value"); + raise Exception("--egs.chunk-width has an invalid value") if not common_train_lib.validate_minibatch_size_str(args.num_chunk_per_minibatch): - raise Exception("--trainer.num-chunk-per-minibatch has an invalid value"); + raise Exception("--trainer.num-chunk-per-minibatch has an invalid value") if args.chunk_left_context < 0: raise Exception("--egs.chunk-left-context should be non-negative") @@ -222,8 +223,8 @@ def process_args(args): "--trainer.input-model option as initial 'raw' model " "(used as 0.raw in the script) should be supplied or " "{0}/configs directory which is the output of " - "make_configs.py script should be provided.".format( - args.dir)) + "make_configs.py script should be provided." + "".format(args.dir)) if args.transform_dir is None: args.transform_dir = args.lat_dir @@ -281,8 +282,8 @@ def train(args, run_opts): # split the training data into parts for individual jobs # we will use the same number of jobs as that used for alignment - common_lib.execute_command("utils/split_data.sh {0} {1}".format( - args.feat_dir, num_jobs)) + common_lib.execute_command("utils/split_data.sh {0} {1}" + "".format(args.feat_dir, num_jobs)) with open('{0}/num_jobs'.format(args.dir), 'w') as f: f.write(str(num_jobs)) @@ -326,8 +327,8 @@ def train(args, run_opts): chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts) if ((args.stage <= -4) and - (os.path.exists("{0}/configs/init.config".format(args.dir))) - and (args.input_model is None)): + os.path.exists("{0}/configs/init.config".format(args.dir)) + and (args.input_model is None)): logger.info("Initializing a basic network for estimating " "preconditioning matrix") common_lib.execute_command( @@ -341,9 +342,11 @@ def train(args, run_opts): # note: the '+ args.frame_subsampling_factor / 2' is to allow for the # fact that we'll be shifting the data slightly during training to give # variety to the training data. 
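The docstring above maps nnet3-info node lines to 'existing' layers; to make that concrete, here is a rough Python sketch of how a single node line could be reduced to a (name, dim) pair. The helper name is illustrative and is not the actual parser in parser.py:

    def parse_node_line(line):
        # 'input-node name=ivector dim=100'                     -> ('ivector', 100)
        # 'component-node name=tdnn1.affine ... output-dim=512' -> ('tdnn1.affine', 512)
        name, dim = None, -1
        for field in line.split():
            if '=' not in field:
                continue
            key, value = field.split('=', 1)
            if key == 'name':
                name = value
            elif key in ('dim', 'output-dim'):
                dim = int(value)
        return name, dim

The re-wrapped context computation just below only changes layout; as a standalone sketch of that logic (the function name and Python-3 integer division are assumptions, not part of the patch):

    def compute_egs_contexts(left_context, right_context,
                             left_context_initial, right_context_final,
                             frame_subsampling_factor):
        # The extra frame_subsampling_factor // 2 widens the context so the
        # frame-shifted copies of the data used during training still fit.
        shift = frame_subsampling_factor // 2
        egs_left = left_context + shift
        egs_right = right_context + shift
        # -1 means "use the same context as in the middle of the utterance".
        egs_left_initial = (left_context_initial + shift
                            if left_context_initial >= 0 else -1)
        egs_right_final = (right_context_final + shift
                           if right_context_final >= 0 else -1)
        return egs_left, egs_right, egs_left_initial, egs_right_final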
- egs_left_context_initial = (left_context_initial + args.frame_subsampling_factor / 2 if + egs_left_context_initial = (left_context_initial + + args.frame_subsampling_factor / 2 if left_context_initial >= 0 else -1) - egs_right_context_final = (right_context_final + args.frame_subsampling_factor / 2 if + egs_right_context_final = (right_context_final + + args.frame_subsampling_factor / 2 if right_context_final >= 0 else -1) default_egs_dir = '{0}/egs'.format(args.dir) @@ -353,8 +356,8 @@ def train(args, run_opts): not os.path.exists("{0}/normalization.fst".format(args.dir)) or not os.path.exists("{0}/tree".format(args.dir))): raise Exception("Chain egs generation expects {0}/den.fst, " - "{0}/normalization.fst and {0}/tree " - "to exist.".format(args.dir)) + "{0}/normalization.fst and {0}/tree " + "to exist.".format(args.dir)) # this is where get_egs.sh is called. chain_lib.generate_chain_egs( dir=args.dir, data=args.feat_dir, @@ -384,11 +387,11 @@ def train(args, run_opts): [egs_left_context, egs_right_context, frames_per_eg_str, num_archives] = ( - common_train_lib.verify_egs_dir(egs_dir, feat_dim, - ivector_dim, ivector_id, - egs_left_context, egs_right_context, - egs_left_context_initial, - egs_right_context_final)) + common_train_lib.verify_egs_dir(egs_dir, feat_dim, + ivector_dim, ivector_id, + egs_left_context, egs_right_context, + egs_left_context_initial, + egs_right_context_final)) assert(args.chunk_width == frames_per_eg_str) num_archives_expanded = num_archives * args.frame_subsampling_factor @@ -472,8 +475,8 @@ def train(args, run_opts): if args.shrink_value < shrinkage_value: shrinkage_value = (args.shrink_value if common_train_lib.should_do_shrinkage( - iter, model_file, - args.shrink_saturation_threshold) + iter, model_file, + args.shrink_saturation_threshold) else shrinkage_value) chain_lib.train_one_iteration( @@ -543,8 +546,8 @@ def train(args, run_opts): logger.info("Copying the last-numbered model to final.mdl") common_lib.force_symlink("{0}.mdl".format(num_iters), "{0}/final.mdl".format(args.dir)) - common_lib.force_symlink("compute_prob_valid.{iter}.log".format( - iter=num_iters-1), + common_lib.force_symlink("compute_prob_valid.{iter}.log" + "".format(iter=num_iters-1), "{dir}/log/compute_prob_valid.final.log".format( dir=args.dir)) diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index a9325079450..fa9cd317331 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -77,8 +77,8 @@ def backup_xconfig_file(xconfig_file, config_dir): try: xconfig_file_in = open(xconfig_file) except: - raise Exception('{0}: error opening file {1} for input'.format( - sys.argv[0], config_dir)) + raise Exception('{0}: error opening file {1} for input' + ''.format(sys.argv[0], config_dir)) print("# This file was created by the command:\n" "# {0}\n" @@ -208,8 +208,9 @@ def write_config_files(config_dir, all_layers): if basename == 'init': continue # do not write the init.config else: - print('{0}: error in xconfig file {1}: may be lack of a output layer'.format( - sys.argv[0], sys.argv[2]), file=sys.stderr) + print('{0}: error in xconfig file {1}: may be lack of a ' + 'output layer'.format(sys.argv[0], sys.argv[2]), + file=sys.stderr) raise header = config_basename_to_header[basename] @@ -221,8 +222,8 @@ def write_config_files(config_dir, all_layers): print(line, file=f) f.close() except Exception as e: - print('{0}: error writing to config file {1}: error is {2}'.format( - 
sys.argv[0], filename, repr(e)), file=sys.stderr) + print('{0}: error writing to config file {1}: error is {2}' + ''.format(sys.argv[0], filename, repr(e)), file=sys.stderr) # we use raise rather than raise(e) as using a blank raise # preserves the backtrace raise @@ -233,10 +234,10 @@ def add_nnet_context_info(config_dir, nnet_edits=None, """Create the 'vars' file that specifies model_left_context, etc.""" common_lib.execute_command("nnet3-init {0} {1}/ref.config " - "{1}/ref.raw".format( - existing_model if - existing_model is not None else "", - config_dir)) + "{1}/ref.raw" + "".format(existing_model if + existing_model is not None else "", + config_dir)) model = "{0}/ref.raw".format(config_dir) if nnet_edits is not None: model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits, @@ -269,10 +270,11 @@ def check_model_contexts(config_dir, nnet_edits=None, existing_model=None): if os.path.exists('{0}/{1}.config'.format(config_dir, file_name)): contexts[file_name] = {} common_lib.execute_command("nnet3-init {0} {1}/{2}.config " - "{1}/{2}.raw".format( - existing_model if - existing_model is not None else '', - config_dir, file_name)) + "{1}/{2}.raw" + "".format(existing_model if + existing_model is not + None else '', + config_dir, file_name)) model = "{0}/{1}.raw".format(config_dir, file_name) if nnet_edits is not None: model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits, From 40dc5e491022831047ed34deed0bcacbfbcc232f Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 24 Aug 2017 19:31:47 -0400 Subject: [PATCH 050/174] smbr: Fix aux objf --- .../nnet3/train/chain_objf/acoustic_model.py | 3 +- egs/wsj/s5/steps/nnet3/chain/train.py | 4 +- src/chain/chain-denominator-smbr.cc | 24 +++--- src/chain/chain-supervision-test.cc | 2 + src/chain/chain-training.cc | 31 ++++--- src/chain/chain-training.h | 2 +- src/nnet3/nnet-chain-combine.cc | 5 +- src/nnet3/nnet-chain-diagnostics.cc | 79 +++++++++++++++--- src/nnet3/nnet-chain-diagnostics.h | 6 +- src/nnet3/nnet-chain-training.cc | 12 ++- src/nnet3/nnet-diagnostics.h | 1 - src/nnet3/nnet-training.cc | 81 +++++++++++++++---- src/nnet3/nnet-training.h | 46 +++++++++-- 13 files changed, 224 insertions(+), 72 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 1690d790c32..54fc5697323 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -301,7 +301,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, if shrinkage_value != 1.0: shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value) - objf_info = "" if smbr_opt != "" else "and objective is sMBR" + objf_info = "" if smbr_opt != "" else ( + "and objective is sMBR and smbr_opt=" + smbr_opt) logger.info("On iteration {0}, learning rate is {1}" "{shrink_info} {objf_info}.".format( iter, learning_rate, diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 14c6c872ba0..e18ac1fbe7c 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -524,7 +524,7 @@ def train(args, run_opts): l2_regularize = (args.smbr_l2_regularize if args.smbr_l2_regularize is not None else args.l2_regularize) - smbr_opt = "--use-smbr-objective" + smbr_opt += " --use-smbr-objective" if silence_pdfs is not None: smbr_opt += " --silence-pdfs=" + silence_pdfs @@ -609,7 +609,7 @@ def train(args, run_opts): l2_regularize = 
(args.smbr_l2_regularize if args.smbr_l2_regularize is not None else args.l2_regularize) - smbr_opt = "--use-smbr-objective" + smbr_opt += " --use-smbr-objective" if silence_pdfs is not None: smbr_opt += " --silence-pdfs=" + silence_pdfs diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc index 87eecbb0073..e365599eb4b 100644 --- a/src/chain/chain-denominator-smbr.cc +++ b/src/chain/chain-denominator-smbr.cc @@ -570,18 +570,18 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { << alpha_beta_smbr_sum << " != " << tot_smbr_sum; } - // use higher tolerance, since we are using randomized pruning for the - // log-prob derivatives. - if (GetVerboseLevel() > 1 || !ApproxEqual( - this_log_prob_deriv_sum, -opts_.mmi_factor * num_sequences_, 0.01)) { - KALDI_WARN << "On time " << t << ", log-prob-deriv sum " - << this_log_prob_deriv_sum << " != " - << opts_.mmi_factor * num_sequences_; - if (fabs(this_log_prob_deriv_sum + opts_.mmi_factor * num_sequences_) > 2.0) { - KALDI_WARN << "Excessive error detected, will abandon this minibatch"; - ok_ = false; - } - } + //// use higher tolerance, since we are using randomized pruning for the + //// log-prob derivatives. + //if (GetVerboseLevel() > 1 || !ApproxEqual( + // this_log_prob_deriv_sum, -opts_.mmi_factor * num_sequences_, 0.01)) { + // KALDI_WARN << "On time " << t << ", log-prob-deriv sum " + // << this_log_prob_deriv_sum << " != " + // << -opts_.mmi_factor * num_sequences_; + // if (fabs(this_log_prob_deriv_sum + opts_.mmi_factor * num_sequences_) > 2.0) { + // KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + // ok_ = false; + // } + //} } diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index eb5e263427a..1d055ab96ec 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -387,6 +387,8 @@ void ChainSmbrTrainingTest(const DenominatorGraph &den_graph, kUndefined); KALDI_LOG << "LF-SMBR training"; opts.use_smbr_objective = true; + opts.mmi_factor = 0.0; + opts.smbr_factor = 1.0; BaseFloat objf, l2_term, weight; ComputeChainSmbrObjfAndDeriv(opts, den_graph, supervision, nnet_output, &objf, &l2_term, &weight, diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index fcde7d735f9..344e0613f3f 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -116,6 +116,7 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, const Supervision &supervision, const CuMatrixBase &nnet_output, BaseFloat *objf, + BaseFloat *mmi_objf, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, @@ -154,15 +155,15 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, supervision.num_sequences, nnet_output, num_posteriors); - BaseFloat mmi_objf; - BaseFloat smbr_objf = denominator.ForwardSmbr(&mmi_objf); - - if (opts.mmi_factor != 0.0) { - DenominatorComputation denominator_mmi(opts, den_graph, - supervision.num_sequences, - nnet_output); - KALDI_ASSERT(kaldi::ApproxEqual(-mmi_objf, opts.mmi_factor * denominator_mmi.Forward())); - } + BaseFloat den_logprob_negated; + BaseFloat smbr_objf = denominator.ForwardSmbr(&den_logprob_negated); + + //if (opts.mmi_factor != 0.0) { + // DenominatorComputation denominator_mmi(opts, den_graph, + // supervision.num_sequences, + // nnet_output); + // KALDI_ASSERT(kaldi::ApproxEqual(-den_logprob_negated, opts.mmi_factor * denominator_mmi.Forward())); + //} bool ok = true; if (nnet_output_deriv) { @@ 
-170,22 +171,26 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, ok = denominator.BackwardSmbr(supervision.weight, nnet_output_deriv); } - *objf = supervision.weight * (smbr_objf + mmi_objf) + num_logprob_weighted; + *objf = supervision.weight * smbr_objf; + *mmi_objf = supervision.weight * den_logprob_negated + num_logprob_weighted; *weight = supervision.weight * supervision.num_sequences * supervision.frames_per_sequence; - if (!((*objf) - (*objf) == 0) || !ok) { + + BaseFloat total_objf = *objf + *mmi_objf; + if (!((total_objf) - (total_objf) == 0) || !ok) { // inf or NaN detected, or denominator computation returned false. if (nnet_output_deriv) nnet_output_deriv->SetZero(); if (xent_output_deriv) xent_output_deriv->SetZero(); BaseFloat default_objf = -opts.mmi_factor * 10; - KALDI_WARN << "Objective function is " << (*objf) + KALDI_WARN << "Objective function is " << (total_objf) << " and denominator computation (if done) returned " << std::boolalpha << ok << ", setting objective function to " << default_objf << " per frame."; - *objf = default_objf * *weight; + *mmi_objf = default_objf * *weight; + *objf = 0.0; } // This code helps us see how big the derivatives are, on average, diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 798b342316d..221d0ed78f5 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -177,7 +177,7 @@ void ComputeChainSmbrObjfAndDeriv( const DenominatorGraph &den_graph, const Supervision &supervision, const CuMatrixBase &nnet_output, - BaseFloat *objf, + BaseFloat *objf, BaseFloat *mmi_objf, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, diff --git a/src/nnet3/nnet-chain-combine.cc b/src/nnet3/nnet-chain-combine.cc index c93858fb06e..626e8448380 100644 --- a/src/nnet3/nnet-chain-combine.cc +++ b/src/nnet3/nnet-chain-combine.cc @@ -52,7 +52,8 @@ NnetChainCombiner::NnetChainCombiner(const NnetCombineConfig &combine_config, ComputeUpdatableComponentDims(); NnetComputeProbOptions compute_prob_opts; compute_prob_opts.compute_deriv = true; - prob_computer_ = new NnetChainComputeProb(compute_prob_opts, chain_config_, den_fst_, nnet_); + prob_computer_ = new NnetChainComputeProb(compute_prob_opts, chain_config_, + den_fst_, nnet_); } void NnetChainCombiner::ComputeUpdatableComponentDims(){ @@ -514,7 +515,7 @@ double NnetChainCombiner::ComputeObjfAndDerivFromNnet( VectorizeNnet(deriv, nnet_params_deriv); // we prefer to deal with normalized objective functions. 
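The comment ending just above ("we prefer to deal with normalized objective functions") amounts to dividing both the objective and the parameter-level derivative by the total frame weight, so the combiner optimizes per-frame quantities regardless of minibatch size. A minimal sketch with hypothetical names, not the actual C++ code:

    def normalize_objf_and_deriv(tot_objf, tot_weight, param_deriv):
        # Scale the objective and its derivative consistently so they remain a
        # matched pair for the optimizer, independent of how many frames were used.
        scale = 1.0 / tot_weight
        return tot_objf * scale, [d * scale for d in param_deriv]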
nnet_params_deriv->Scale(1.0 / objf_info->tot_weight); - return (objf_info->tot_like + objf_info->tot_l2_term) / objf_info->tot_weight; + return (objf_info->tot_like + objf_info->tot_aux_objfs.Sum()) / objf_info->tot_weight; } diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 1b1c630d4dc..709ad0bce8b 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -44,6 +44,30 @@ NnetChainComputeProb::NnetChainComputeProb( KALDI_ERR << "If you set store_component_stats == true and " << "compute_deriv == false, use the other constructor."; } + + if (!chain_config.silence_pdfs_str.empty()) { + std::vector silence_pdfs; + SplitStringToVector(chain_config.silence_pdfs_str, ":,", false, + &silence_pdfs); + + int32 num_pdfs = nnet.OutputDim("output"); + std::vector indices(num_pdfs); + for (size_t i = 0; i < num_pdfs; i++) { + indices[i] = i; + } + + for (std::vector::iterator it = silence_pdfs.begin(); + it != silence_pdfs.end(); ++it) { + int32 pdf = std::atoi(it->c_str()); + if (pdf > num_pdfs) + KALDI_ERR << "Invalid pdf " << pdf << " in silence-pdfs " + << chain_config.silence_pdfs_str; + indices[pdf] = -1; + } + + sil_indices_.Resize(num_pdfs); + sil_indices_.CopyFromVec(indices); + } } @@ -62,6 +86,30 @@ NnetChainComputeProb::NnetChainComputeProb( num_minibatches_processed_(0) { KALDI_ASSERT(den_graph_.NumPdfs() > 0); KALDI_ASSERT(nnet_config.store_component_stats && !nnet_config.compute_deriv); + + if (!chain_config.silence_pdfs_str.empty()) { + std::vector silence_pdfs; + SplitStringToVector(chain_config.silence_pdfs_str, ":,", false, + &silence_pdfs); + + int32 num_pdfs = nnet->OutputDim("output"); + std::vector indices(num_pdfs); + for (size_t i = 0; i < num_pdfs; i++) { + indices[i] = i; + } + + for (std::vector::iterator it = silence_pdfs.begin(); + it != silence_pdfs.end(); ++it) { + int32 pdf = std::atoi(it->c_str()); + if (pdf > num_pdfs) + KALDI_ERR << "Invalid pdf " << pdf << " in silence-pdfs " + << chain_config.silence_pdfs_str; + indices[pdf] = -1; + } + + sil_indices_.Resize(num_pdfs); + sil_indices_.CopyFromVec(indices); + } } @@ -136,15 +184,16 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); - BaseFloat tot_like, tot_l2_term, tot_weight; + BaseFloat tot_like, tot_mmi_objf, tot_l2_term, tot_weight; if (chain_config_.use_smbr_objective) ComputeChainSmbrObjfAndDeriv( chain_config_, den_graph_, sup.supervision, nnet_output, - &tot_like, &tot_l2_term, &tot_weight, + &tot_like, &tot_mmi_objf, &tot_l2_term, &tot_weight, (nnet_config_.compute_deriv ? &nnet_output_deriv : - NULL), (use_xent ? &xent_deriv : NULL)); + NULL), (use_xent ? &xent_deriv : NULL), + sil_indices_.Dim() ? &sil_indices_ : NULL); else ComputeChainObjfAndDeriv(chain_config_, den_graph_, sup.supervision, nnet_output, @@ -160,10 +209,15 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, // and conjugate gradient descent both rely on the derivatives being // accurate, and don't fail gracefully if the derivatives are not accurate). 
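The additions that follow generalize the single l2 term into a list of auxiliary objectives (the l2 term and, when the sMBR objective is used, the negated MMI/denominator term). A rough Python sketch of that bookkeeping, using hypothetical names rather than the actual C++ API:

    class ObjectiveTotals:
        def __init__(self):
            self.tot_weight = 0.0
            self.tot_objf = 0.0
            self.tot_aux = []          # one running sum per auxiliary objective

        def update(self, weight, objf, aux_objfs):
            if not self.tot_aux:
                self.tot_aux = [0.0] * len(aux_objfs)
            assert len(aux_objfs) == len(self.tot_aux)
            self.tot_weight += weight
            self.tot_objf += objf
            self.tot_aux = [t + a for t, a in zip(self.tot_aux, aux_objfs)]

        def report(self):
            # Per-frame main objective, per-frame auxiliary terms, and their sum,
            # mirroring the "objf + aux1 + aux2 = total per frame" log lines.
            main = self.tot_objf / self.tot_weight
            aux = [t / self.tot_weight for t in self.tot_aux]
            return main, aux, main + sum(aux)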
+ std::vector aux_objfs; + aux_objfs.push_back(tot_l2_term); + if (chain_config_.use_smbr_objective) + aux_objfs.push_back(tot_mmi_objf); + ChainObjectiveInfo &totals = objf_info_[sup.name]; totals.tot_weight += tot_weight; totals.tot_like += tot_like; - totals.tot_l2_term += tot_l2_term; + totals.tot_aux_objfs.Add(aux_objfs); if (nnet_config_.compute_deriv) computer->AcceptInput(sup.name, &nnet_output_deriv); @@ -195,10 +249,13 @@ bool NnetChainComputeProb::PrintTotalStats() const { int32 node_index = nnet_.GetNodeIndex(name); KALDI_ASSERT(node_index >= 0); const ChainObjectiveInfo &info = iter->second; - BaseFloat like = (info.tot_like / info.tot_weight), - l2_term = (info.tot_l2_term / info.tot_weight), - tot_objf = like + l2_term; - if (info.tot_l2_term == 0.0) { + BaseFloat like = (info.tot_like / info.tot_weight); + + ObjectiveValues aux_objfs(info.tot_aux_objfs); + aux_objfs.Scale(1.0 / info.tot_weight); + BaseFloat tot_objf = like + aux_objfs.Sum(); + + if (info.tot_aux_objfs.IsZero()) { KALDI_LOG << "Overall log-probability for '" << name << "' is " << like << " per frame" @@ -206,7 +263,8 @@ bool NnetChainComputeProb::PrintTotalStats() const { } else { KALDI_LOG << "Overall log-probability for '" << name << "' is " - << like << " + " << l2_term << " = " << tot_objf << " per frame" + << like << " + " << info.tot_aux_objfs.Str() + << " = " << tot_objf << " per frame" << ", over " << info.tot_weight << " frames."; } if (info.tot_weight > 0) @@ -243,8 +301,7 @@ void RecomputeStats(const std::vector &egs, ZeroComponentStats(nnet); NnetComputeProbOptions nnet_config; nnet_config.store_component_stats = true; - NnetChainComputeProb prob_computer(nnet_config, chain_config, den_fst, - *nnet); + NnetChainComputeProb prob_computer(nnet_config, chain_config, den_fst, nnet); for (size_t i = 0; i < egs.size(); i++) prob_computer.Compute(egs[i]); prob_computer.PrintTotalStats(); diff --git a/src/nnet3/nnet-chain-diagnostics.h b/src/nnet3/nnet-chain-diagnostics.h index 4125427c463..93e06094c52 100644 --- a/src/nnet3/nnet-chain-diagnostics.h +++ b/src/nnet3/nnet-chain-diagnostics.h @@ -36,10 +36,9 @@ namespace nnet3 { struct ChainObjectiveInfo { double tot_weight; double tot_like; - double tot_l2_term; + ObjectiveValues tot_aux_objfs; ChainObjectiveInfo(): tot_weight(0.0), - tot_like(0.0), - tot_l2_term(0.0) { } + tot_like(0.0) { } }; @@ -103,6 +102,7 @@ class NnetChainComputeProb { unordered_map objf_info_; + CuArray sil_indices_; }; /// This function zeros the stored component-level stats in the nnet using diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 695bdb4f83c..6c6ecd1685a 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -202,12 +202,13 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); - BaseFloat tot_objf, tot_l2_term, tot_weight; + BaseFloat tot_objf, tot_mmi_objf, tot_l2_term, tot_weight; if (opts_.chain_config.use_smbr_objective) { ComputeChainSmbrObjfAndDeriv(opts_.chain_config, den_graph_, sup.supervision, nnet_output, - &tot_objf, &tot_l2_term, &tot_weight, + &tot_objf, &tot_mmi_objf, + &tot_l2_term, &tot_weight, &nnet_output_deriv, (use_xent ? &xent_deriv : NULL), sil_indices_.Dim() ? 
&sil_indices_ : NULL); @@ -249,10 +250,15 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, computer->AcceptInput(sup.name, &nnet_output_deriv); + std::vector objective_values; + objective_values.push_back(tot_l2_term); + if (opts_.chain_config.use_smbr_objective) + objective_values.push_back(tot_mmi_objf); + objf_info_[sup.name + suffix].UpdateStats(sup.name + suffix, opts_.nnet_config.print_interval, num_minibatches_processed_, - tot_weight, tot_objf, tot_l2_term); + tot_weight, tot_objf, objective_values); if (use_xent) { xent_deriv.Scale(opts_.chain_config.xent_regularize); diff --git a/src/nnet3/nnet-diagnostics.h b/src/nnet3/nnet-diagnostics.h index bdb2c037f26..eb96381fb94 100644 --- a/src/nnet3/nnet-diagnostics.h +++ b/src/nnet3/nnet-diagnostics.h @@ -39,7 +39,6 @@ struct SimpleObjectiveInfo { }; - struct NnetComputeProbOptions { bool debug_computation; bool compute_deriv; diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 7428818014c..3b4502b0358 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -214,13 +214,62 @@ void NnetTrainer::PrintMaxChangeStats() const { << " \% of the time."; } +ObjectiveValues::ObjectiveValues(const std::vector &values) { + for (std::vector::const_iterator it = values.begin(); + it != values.end(); ++it) { + objective_values.push_back(*it); + } +} + +void ObjectiveValues::Add(const ObjectiveValues &other) { + if (Size() != other.Size()) { + KALDI_ERR << "objective values must have same size."; + } + + for (size_t i = 0; i < Size(); i++) { + objective_values[i] += other.objective_values[i]; + } +} + +void ObjectiveValues::Scale(BaseFloat scale) { + for (std::vector::iterator it = objective_values.begin(); + it != objective_values.end(); ++it) { + *it *= scale; + } +} + +bool ObjectiveValues::IsZero() const { + for (std::vector::const_iterator it = objective_values.begin(); + it != objective_values.end(); ++it) { + if (*it != 0.0) return false; + } + return true; +} + +double ObjectiveValues::Sum() const { + double sum = 0.0; + for (std::vector::const_iterator it = objective_values.begin(); + it != objective_values.end(); ++it) { + sum += *it; + } + return sum; +} + +std::string ObjectiveValues::Str() const { + std::ostringstream oss; + for (size_t i = 0; i < Size(); i++) { + oss << objective_values[i] << (i < Size() - 1 ? 
" + " : ""); + } + return oss.str(); +} + void ObjectiveFunctionInfo::UpdateStats( const std::string &output_name, int32 minibatches_per_phase, int32 minibatch_counter, BaseFloat this_minibatch_weight, BaseFloat this_minibatch_tot_objf, - BaseFloat this_minibatch_tot_aux_objf) { + const ObjectiveValues &this_minibatch_tot_aux_objfs) { int32 phase = minibatch_counter / minibatches_per_phase; if (phase != current_phase) { KALDI_ASSERT(phase > current_phase); @@ -229,16 +278,16 @@ void ObjectiveFunctionInfo::UpdateStats( current_phase = phase; tot_weight_this_phase = 0.0; tot_objf_this_phase = 0.0; - tot_aux_objf_this_phase = 0.0; + tot_aux_objfs_this_phase.Reset(); minibatches_this_phase = 0; } minibatches_this_phase++; tot_weight_this_phase += this_minibatch_weight; tot_objf_this_phase += this_minibatch_tot_objf; - tot_aux_objf_this_phase += this_minibatch_tot_aux_objf; + tot_aux_objfs_this_phase.Add(this_minibatch_tot_aux_objfs); tot_weight += this_minibatch_weight; tot_objf += this_minibatch_tot_objf; - tot_aux_objf += this_minibatch_tot_aux_objf; + tot_aux_objfs.Add(this_minibatch_tot_aux_objfs); } void ObjectiveFunctionInfo::PrintStatsForThisPhase( @@ -248,7 +297,7 @@ void ObjectiveFunctionInfo::PrintStatsForThisPhase( int32 start_minibatch = current_phase * minibatches_per_phase, end_minibatch = phase * minibatches_per_phase - 1; - if (tot_aux_objf_this_phase == 0.0) { + if (tot_aux_objfs_this_phase.IsZero()) { if (minibatches_per_phase == minibatches_this_phase) { KALDI_LOG << "Average objective function for '" << output_name << "' for minibatches " << start_minibatch @@ -264,36 +313,38 @@ void ObjectiveFunctionInfo::PrintStatsForThisPhase( << tot_weight_this_phase << " frames."; } } else { - BaseFloat objf = (tot_objf_this_phase / tot_weight_this_phase), - aux_objf = (tot_aux_objf_this_phase / tot_weight_this_phase), - sum_objf = objf + aux_objf; + BaseFloat objf = (tot_objf_this_phase / tot_weight_this_phase); + ObjectiveValues aux_objfs(tot_aux_objfs_this_phase); + aux_objfs.Scale(1.0 / tot_weight_this_phase); + BaseFloat sum_objf = objf + aux_objfs.Sum(); if (minibatches_per_phase == minibatches_this_phase) { KALDI_LOG << "Average objective function for '" << output_name << "' for minibatches " << start_minibatch << '-' << end_minibatch << " is " - << objf << " + " << aux_objf << " = " << sum_objf + << objf << " + " << aux_objfs.Str() << " = " << sum_objf << " over " << tot_weight_this_phase << " frames."; } else { KALDI_LOG << "Average objective function for '" << output_name << "' using " << minibatches_this_phase << " minibatches in minibatch range " << start_minibatch << '-' << end_minibatch << " is " - << objf << " + " << aux_objf << " = " << sum_objf + << objf << " + " << aux_objfs.Str() << " = " << sum_objf << " over " << tot_weight_this_phase << " frames."; } } } bool ObjectiveFunctionInfo::PrintTotalStats(const std::string &name) const { - BaseFloat objf = (tot_objf / tot_weight), - aux_objf = (tot_aux_objf / tot_weight), - sum_objf = objf + aux_objf; - if (tot_aux_objf == 0.0) { + BaseFloat objf = (tot_objf / tot_weight); + ObjectiveValues aux_objfs(tot_aux_objfs); + aux_objfs.Scale(1.0 / tot_weight); + BaseFloat sum_objf = objf + aux_objfs.Sum(); + if (tot_aux_objfs.IsZero()) { KALDI_LOG << "Overall average objective function for '" << name << "' is " << (tot_objf / tot_weight) << " over " << tot_weight << " frames."; } else { KALDI_LOG << "Overall average objective function for '" << name << "' is " - << objf << " + " << aux_objf << " = " << sum_objf + << objf << " + " 
<< aux_objfs.Str() << " = " << sum_objf << " over " << tot_weight << " frames."; } diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h index 42837df1f86..b321a84d2d8 100644 --- a/src/nnet3/nnet-training.h +++ b/src/nnet3/nnet-training.h @@ -100,6 +100,34 @@ struct NnetTrainerOptions { } }; +// This struct is used to store multiple objective function values +// and do basic operations on all of them. +struct ObjectiveValues { + std::vector objective_values; + + ObjectiveValues() { } + + ObjectiveValues(const std::vector &values): + objective_values(values) { } + + ObjectiveValues(const std::vector &values); + + int32 Size() const { return objective_values.size(); } + + void Add(const ObjectiveValues &other); + + void Scale(BaseFloat scale); + + void Reset() { Scale(0.0); } + + bool IsZero() const; + + double Sum() const; + + std::string Str() const; +}; + + // This struct is used in multiple nnet training classes for keeping // track of objective function values. // Also see struct AccuracyInfo, in nnet-diagnostics.h. @@ -110,22 +138,23 @@ struct ObjectiveFunctionInfo { // 'current_phase'. double tot_weight; double tot_objf; - double tot_aux_objf; // An 'auxiliary' objective function that is optional- - // may be used when things like regularization are being - // used. + + // A struct used to store 'auxiliary' objective function values + // that is optional- may be used when things like regularization are being + // used. + ObjectiveValues tot_aux_objfs; double tot_weight_this_phase; double tot_objf_this_phase; - double tot_aux_objf_this_phase; + ObjectiveValues tot_aux_objfs_this_phase; CuVector deriv_sum; ObjectiveFunctionInfo(): current_phase(0), minibatches_this_phase(0), - tot_weight(0.0), tot_objf(0.0), tot_aux_objf(0.0), - tot_weight_this_phase(0.0), tot_objf_this_phase(0.0), - tot_aux_objf_this_phase(0.0) { } + tot_weight(0.0), tot_objf(0.0), + tot_weight_this_phase(0.0), tot_objf_this_phase(0.0) { } // This function updates the stats and, if the phase has just changed, // prints a message indicating progress. The phase equals @@ -136,7 +165,8 @@ struct ObjectiveFunctionInfo { int32 minibatch_counter, BaseFloat this_minibatch_weight, BaseFloat this_minibatch_tot_objf, - BaseFloat this_minibatch_tot_aux_objf = 0.0); + const ObjectiveValues &this_minibatch_tot_aux_objfs + = ObjectiveValues()); // Prints stats for the current phase. // Note: 'phase' will normally be this->current_phase + 1, but may under From a856deafdae2efcadb05ce332e6a593c1a06299d Mon Sep 17 00:00:00 2001 From: pegahgh Date: Sat, 26 Aug 2017 18:16:58 -0400 Subject: [PATCH 051/174] Update parser.py --- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 636f26a1076..62290606129 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -78,8 +78,8 @@ def get_model_component_info(model_filename): i.e. 
layer in *.mdl -> corresponding 'XconfigExistingLayer' layer 'input-node name=ivector dim=100' -> 'existing name=ivector dim=100' - 'component-node name=tdnn1.affine ** input-node=1000 ' - 'output-node=500' -> + 'component-node name=tdnn1.affine ** input-dim=1000 ' + 'output-dim=500' -> 'existing name=tdnn1.affine dim=500' """ From 55a64ff0bb8b3c199eb07ddfcba939e4fbd3a862 Mon Sep 17 00:00:00 2001 From: pegahgh Date: Wed, 30 Aug 2017 11:43:44 -0400 Subject: [PATCH 052/174] Update run_tdnn_wsj_rm_1c.sh --- egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index 34b91e2fd13..8528d8afb37 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -1,21 +1,21 @@ #!/bin/bash # _1c is as _1b but it uses source chain-trained DNN model instead of GMM model -# to generate alignments for RM using SWJ model. +# to generate alignments for RM using WSJ model. # _1b is as _1a, but different as follows # 1) It uses wsj phone set phones.txt and new lexicon generated using word pronunciation -# in swj lexincon.txt. rm words, that are not presented in wsj, are added as oov +# in wsj lexicon.txt. rm words, that are not presented in wsj, are added as oov # in new lexicon.txt. # 2) It uses wsj tree-dir and generates new alignments and lattices for rm using # wsj gmm model. # 3) It also trains phone LM using weighted combination of alignemts from wsj # and rm, which is used in chain denominator graph. # Since we use phone.txt from source dataset, this can be helpful in cases -# where there is few training data in the target domain and some 4-gram phone +# where there is a few training data in the target domain and some 4-gram phone # sequences have no count in the target domain. -# 4) It uses whole already-trained model and does not replace the output layer -# from already-trained model with new randomely initialized output layer and -# re-train it using target dataset. +# 4) It transfers all layers in already-trained model and +# re-train the last layer using target dataset, instead of replacing it +# with new randomely initialized output layer. # This script uses weight transfer as Transfer learning method # and use already trained model on wsj and fine-tune the whole network using @@ -38,7 +38,7 @@ common_egs_dir= primary_lr_factor=0.25 # learning-rate factor for all except last layer in transferred source model nnet_affix=_online_wsj -phone_lm_scales="1,10" # comma-separated list of integer valued scale weights +phone_lm_scales="1,10" # comma-separated list of int valued scale weights # to scale different phone sequences for different alignments # e.g. 
(src-weight,target-weight)=(10,1) From c2593d86428ae430546a6ff9a87666e4894e0151 Mon Sep 17 00:00:00 2001 From: pegahgh Date: Wed, 30 Aug 2017 12:13:55 -0400 Subject: [PATCH 053/174] Update basic_layers.py --- .../steps/libs/nnet3/xconfig/basic_layers.py | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index ff89687ac16..10d8af6385b 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -42,16 +42,16 @@ def __init__(self, first_token, key_to_value, all_layers): raise RuntimeError("Invalid value: name={0}".format( key_to_value['name'])) - # It is possible to have two layers with same name if one of them - # is 'existing' layer type. - # Layers of type 'existing' correspond to component-node names - # from existing model that we are adding layers to them. - # These layers are not presented in any config file and new layer - # with same name as these layers can exist in all_layers. - # i.e. output-node with name 'output' in the existing model is added to - # all_layers using layer type 'existing' and it is possible to have - # 'output-node' of type 'output-layer' with same name 'output' in - # all_layers. + # It is possible to have two layers with a same name in 'all_layer', if + # the layer type for one of them is 'existing'. + # Layers of type 'existing' are corresponding to the component-node names + # in the existing model, which we are adding layers to them. + # 'existing' layers are not presented in any config file, and new layer + # with the same name can exist in 'all_layers'. + # e.g. It is possible to have 'output-node' with name 'output' in the + # existing model, which is added to all_layers using layer type 'existing', + # and 'output-node' of type 'output-layer' with the same name 'output' in + # 'all_layers'. for prev_layer in all_layers: if (self.name == prev_layer.name and prev_layer.layer_type is not 'existing'): @@ -1088,18 +1088,21 @@ def get_full_config(self): class XconfigExistingLayer(XconfigLayerBase): - """This class is for lines like - 'existing name=tdnn1.affine dim=40' - This layer contains 'dim' and 'name' and it is not presented in any actual - config files. - Layers of this type are created internally for all component nodes in - an existing neural net model for use as input to other layers. + """ + This class is for lines like + 'existing name=tdnn1.affine dim=40'. + + This layer contains 'dim' and 'name' and it is not presented in + any actual config files. + Layers of this type are created internally for all component nodes + in an existing neural net model to use as input to other layers. (i.e. get_model_component_info function, which is called in steps/nnet3/xconfig_to_configs.py, returns a list of 'existing' layers for component nodes used in 'existing_model') + This class is useful in cases like transferring existing model - and using {input, output, component}-nodes - in the model as input to new layers. + and using {input, output, component}-nodes in this model as + input to new layers. 
""" def __init__(self, first_token, key_to_value, prev_names=None): From 26b4ddd81647d7cebabce7c4f9b67bafeef6a1d6 Mon Sep 17 00:00:00 2001 From: pegahgh Date: Wed, 30 Aug 2017 12:21:42 -0400 Subject: [PATCH 054/174] Update parser.py --- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 62290606129..b6634e23d3e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -70,17 +70,18 @@ def xconfig_line_to_object(config_line, prev_layers = None): def get_model_component_info(model_filename): - """ This function reads existing model (*.raw or *.mdl) and returns array - of XconfigExistingLayer one per {input,output}-node or component-node - with same 'name' used in the raw model and 'dim' equal to 'output-dim' - for component-node and 'dim' for {input,output}-node. - - i.e. layer in *.mdl -> corresponding 'XconfigExistingLayer' layer - 'input-node name=ivector dim=100' -> - 'existing name=ivector dim=100' - 'component-node name=tdnn1.affine ** input-dim=1000 ' - 'output-dim=500' -> - 'existing name=tdnn1.affine dim=500' + """ + This function reads existing model (*.raw or *.mdl) and returns array + of XconfigExistingLayer one per {input,output}-node or component-node + with same 'name' used in the raw model and 'dim' equal to 'output-dim' + for component-node and 'dim' for {input,output}-node. + + e.g. layer in *.mdl -> corresponding 'XconfigExistingLayer' layer + 'input-node name=ivector dim=100' -> + 'existing name=ivector dim=100' + 'component-node name=tdnn1.affine ... input-dim=1000 ' + 'output-dim=500' -> + 'existing name=tdnn1.affine dim=500' """ all_layers = [] @@ -134,10 +135,10 @@ def get_model_component_info(model_filename): # (usually we use the variable name 'all_layers' elsewhere for this). # It will die if the xconfig file is empty or if there was # some error parsing it. -# 'existing_layers' contains some 'existing' layers (layers which are not really +# 'existing_layers' contains some layers of type 'existing' (layers which are not really # layers but are actual component node names from an existing neural net model -# and created using get_model_component_info function.) -# that can be used as input to component-nodes in layers xconfig file. +# and created using get_model_component_info function). +# 'existing' layers can be used as input to component-nodes in layers of xconfig file. def read_xconfig_file(xconfig_filename, existing_layers=[]): try: f = open(xconfig_filename, 'r') From 90fc04ab237097efe9d12a3601ea2780d72674a8 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 1 Sep 2017 12:20:22 -0400 Subject: [PATCH 055/174] chain: objective function fixes --- src/nnet3/nnet-chain-combine.cc | 4 ++-- src/nnet3/nnet-chain-diagnostics.cc | 9 +++++---- src/nnet3/nnet-training.cc | 9 +++++++++ src/nnet3/nnet-training.h | 1 + 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/nnet3/nnet-chain-combine.cc b/src/nnet3/nnet-chain-combine.cc index a6c189e1867..05e35422dd0 100644 --- a/src/nnet3/nnet-chain-combine.cc +++ b/src/nnet3/nnet-chain-combine.cc @@ -514,8 +514,8 @@ double NnetChainCombiner::ComputeObjfAndDerivFromNnet( const Nnet &deriv = prob_computer_->GetDeriv(); VectorizeNnet(deriv, nnet_params_deriv); // we prefer to deal with normalized objective functions. 
- nnet_params_deriv->Scale(1.0 / objf_info->tot_weight); - return (objf_info->tot_like + objf_info->tot_aux_objfs.Sum()) / objf_info->tot_weight; + nnet_params_deriv->Scale(1.0 / tot_weight); + return (tot_objf / tot_weight); } diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 88014ffe144..c31c0ed90ac 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -263,7 +263,7 @@ bool NnetChainComputeProb::PrintTotalStats() const { } else { KALDI_LOG << "Overall log-probability for '" << name << "' is " - << like << " + " << info.tot_aux_objfs.Str() + << like << " + " << aux_objfs.Str() << " = " << tot_objf << " per frame" << ", over " << info.tot_weight << " frames."; } @@ -285,9 +285,10 @@ std::pair NnetChainComputeProb::GetTotalObjective() const int32 node_index = nnet_.GetNodeIndex(name); KALDI_ASSERT(node_index >= 0); const ChainObjectiveInfo &info = iter->second; - BaseFloat like = (info.tot_like / info.tot_weight), - l2_term = (info.tot_l2_term / info.tot_weight); - tot_objf += like + l2_term; + BaseFloat like = (info.tot_like / info.tot_weight); + ObjectiveValues aux_objfs(info.tot_aux_objfs); + aux_objfs.Scale(info.tot_weight); + tot_objf += like + aux_objfs.Sum(); tot_weight += info.tot_weight; } return std::make_pair(tot_objf, tot_weight); diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 3b4502b0358..5e1e01b1106 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -221,7 +221,16 @@ ObjectiveValues::ObjectiveValues(const std::vector &values) { } } +void ObjectiveValues::Resize(int32 size) { + objective_values.clear(); + objective_values.resize(size); +} + void ObjectiveValues::Add(const ObjectiveValues &other) { + if (Size() == 0) { + Resize(other.Size()); + } + if (Size() != other.Size()) { KALDI_ERR << "objective values must have same size."; } diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h index b321a84d2d8..4dc6f667a64 100644 --- a/src/nnet3/nnet-training.h +++ b/src/nnet3/nnet-training.h @@ -113,6 +113,7 @@ struct ObjectiveValues { ObjectiveValues(const std::vector &values); int32 Size() const { return objective_values.size(); } + void Resize(int32 size); void Add(const ObjectiveValues &other); From d811e1570bc36f25c4ed9fe0c2046e4f4deaa8be Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 1 Sep 2017 12:21:46 -0400 Subject: [PATCH 056/174] semisup: Minor fixes to chain semisup --- egs/fisher_english/s5/cmd.sh | 3 +- .../run_tdnn_11k_semisupervised_conf_a.sh | 8 +- .../run_tdnn_11k_semisupervised_conf_b.sh | 6 +- .../run_tdnn_11k_semisupervised_conf_d.sh | 7 +- .../run_tdnn_11k_semisupervised_conf_e.sh | 74 ++++++++++++++++++- .../run_tdnn_11k_semisupervised_conf_f.sh | 5 ++ .../run_tdnn_11k_semisupervised_conf_g.sh | 7 +- egs/fisher_english/s5/path.sh | 2 + .../nnet3/train/chain_objf/acoustic_model.py | 2 +- egs/wsj/s5/steps/libs/nnet3/train/common.py | 5 ++ egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh | 17 ++++- egs/wsj/s5/steps/nnet3/align_lats.sh | 8 ++ egs/wsj/s5/steps/nnet3/decode.sh | 2 +- egs/wsj/s5/steps/nnet3/get_degs.sh | 6 +- .../s5/steps/nnet3/train_discriminative.sh | 24 +++--- egs/wsj/s5/steps/nnet3/xconfig_to_configs.py | 2 +- egs/wsj/s5/steps/scoring/score_kaldi_wer.sh | 57 +++++++------- egs/wsj/s5/steps/tfrnnlm/vanilla_rnnlm.py | 6 +- egs/wsj/s5/utils/mkgraph.sh | 22 +++++- src/latbin/Makefile | 3 +- src/tfrnnlm/Makefile | 2 +- 21 files changed, 209 insertions(+), 59 deletions(-) mode change 100644 => 100755 
egs/wsj/s5/steps/tfrnnlm/vanilla_rnnlm.py diff --git a/egs/fisher_english/s5/cmd.sh b/egs/fisher_english/s5/cmd.sh index 88db78823a5..44ec34bcd61 100644 --- a/egs/fisher_english/s5/cmd.sh +++ b/egs/fisher_english/s5/cmd.sh @@ -11,5 +11,6 @@ # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. export train_cmd="queue.pl --mem 4G" -export decode_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 8G" export mkgraph_cmd="queue.pl --mem 8G" +export tfrnnlm_cmd="queue.pl -l hostname=b*" # this is specific to the CLSP grid diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh index 60f64dee299..245f25641b9 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh @@ -1,6 +1,12 @@ #!/bin/bash -# This script is same as _f, but fixes the bug about acwt for best path. +# This script is for semi-supervised training with 250h unsupervised set and +# around 10-15h supervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 0.3 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 3gram set -u -e -o pipefail diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_b.sh index f106549167f..9b7a424b897 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_b.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_b.sh @@ -1,6 +1,10 @@ #!/bin/bash -# This script is same as _f, but fixes the bug about acwt for best path. +# This script is same as _a, but uses no deriv weights. +# unsup_frames_per_eg=150 +# Deriv weights: None +# Unsupervised weight: 0.3 +# Weights for phone LM (supervised, unsupervises): 5,2 set -u -e -o pipefail diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_d.sh index 780c783c87f..23c58768b04 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_d.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_d.sh @@ -1,6 +1,11 @@ #!/bin/bash -# This script is same as _f, but fixes the bug about acwt for best path. +# This script is same as _a, but uses 4gram LM for generating unsupervised data lattices. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 0.3 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram set -u -e -o pipefail diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh index 9f2a2a8993b..b8caaa53dea 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh @@ -1,6 +1,11 @@ #!/bin/bash -# This script is same as _f, but fixes the bug about acwt for best path. 
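The headers being added to these tuning scripts summarize the main semi-supervised knobs: a global weight on the unsupervised data and per-frame deriv weights taken from the lattice posterior of the best-path pdf. As a rough, purely illustrative sketch (hypothetical names, not code from these scripts) of how such weights act on the unsupervised gradient:

    def scale_unsupervised_grads(frame_grads, deriv_weights, unsup_weight):
        # Each unsupervised frame's gradient is damped both by how confident the
        # lattice is about that frame and by the global unsupervised-data weight.
        return [g * w * unsup_weight for g, w in zip(frame_grads, deriv_weights)]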
+# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram set -u -e -o pipefail @@ -46,6 +51,14 @@ minibatch_size=128 # frames_per_eg for unsupervised decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" # End configuration section. echo "$0 $@" # Print the command line for logging @@ -365,10 +378,69 @@ if [ $stage -le 18 ]; then steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; ) & done fi + wait; exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_f.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_f.sh index 346c5e6eede..ac986ce6dda 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_f.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_f.sh @@ -1,6 +1,11 @@ #!/bin/bash # This script is same as _e, but is run for 3 epochs instead of 4. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram set -u -e -o pipefail diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_g.sh index ccca9c6d334..12909f33e15 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_g.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_g.sh @@ -1,6 +1,11 @@ #!/bin/bash -# This script is same as _e, but is run for 3 epochs instead of 4. +# This script is same as _f, but uses 300 frames-per-eg +# unsup_frames_per_eg=300 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram set -u -e -o pipefail diff --git a/egs/fisher_english/s5/path.sh b/egs/fisher_english/s5/path.sh index 1a6fb5f891b..0b0b53d8e5d 100755 --- a/egs/fisher_english/s5/path.sh +++ b/egs/fisher_english/s5/path.sh @@ -2,4 +2,6 @@ export KALDI_ROOT=`pwd`/../../.. export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . 
$KALDI_ROOT/tools/config/common_path.sh +export PYTHONPATH=$PYTHONPATH:$KALDI_ROOT/tools/tensorflow_build/.local/lib/python2.7/site-packages +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$KALDI_ROOT/tools/tensorflow/bazel-bin/tensorflow/:/usr/local/cuda/lib64:/export/a11/hlyu/cudnn/lib64:/home/dpovey/libs/ export LC_ALL=C diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index b38da4ac566..a9d38c33182 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -390,7 +390,7 @@ def check_for_required_files(feat_dir, tree_dir, lat_dir): files = ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), - '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)] + '{0}/num_jobs'.format(lat_dir)] for file in files: if not os.path.isfile(file): raise Exception('Expected {0} to exist.'.format(file)) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 30ebac00c4e..837500fe927 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -457,6 +457,11 @@ def verify_egs_dir(egs_dir, feat_dim, ivector_dim, ivector_extractor_id, left_context_initial = left_context if right_context_final == -1: right_context_final = right_context + if egs_left_context_initial == -1: + egs_left_context_initial = egs_left_context + if egs_right_context_final == -1: + egs_right_context_final = egs_right_context + # the condition on the initial/final context is an equality condition, # not an inequality condition, as there is no mechanism to 'correct' the diff --git a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh index d3e6ca73dd4..3d0e50fc4c0 100755 --- a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh +++ b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh @@ -14,6 +14,9 @@ max_ngram_order=4 N=10 inv_acwt=12 weight=1.0 # Interpolation weight for RNNLM. + +expand_ngram=false +beam= # End configuration section. rnnlm_ver= #layer_string= @@ -89,11 +92,21 @@ mkdir -p $outdir/log nj=`cat $indir/num_jobs` || exit 1; cp $indir/num_jobs $outdir +lat="ark:gunzip -c $indir/lat.JOB.gz |" + +if $expand_ngram; then + lat="$lat lattice-expand-ngram --n=$max_ngram_order ark:- ark:- |" +fi + +if [ ! 
-z "$beam" ]; then + lat="$lat lattice-prune --inv-acoustic-scale=$inv_acwt --beam=$beam ark:- ark:- |" +fi + oldlm_weight=`perl -e "print -1.0 * $weight;"` if [ "$oldlm" == "$oldlang/G.fst" ]; then $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ lattice-lmrescore --lm-scale=$oldlm_weight \ - "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm_command" ark:- \| \ + "$lat" "$oldlm_command" ark:- \| \ $rescoring_binary $extra_arg --lm-scale=$weight \ --max-ngram-order=$max_ngram_order \ $first_arg $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \ @@ -101,7 +114,7 @@ if [ "$oldlm" == "$oldlang/G.fst" ]; then else $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ lattice-lmrescore-const-arpa --lm-scale=$oldlm_weight \ - "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm_command" ark:- \| \ + "$lat" "$oldlm_command" ark:- \| \ $rescoring_binary $extra_arg --lm-scale=$weight \ --max-ngram-order=$max_ngram_order \ $first_arg $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \ diff --git a/egs/wsj/s5/steps/nnet3/align_lats.sh b/egs/wsj/s5/steps/nnet3/align_lats.sh index f931b826b7b..06b4520e6f9 100755 --- a/egs/wsj/s5/steps/nnet3/align_lats.sh +++ b/egs/wsj/s5/steps/nnet3/align_lats.sh @@ -24,6 +24,7 @@ extra_left_context_initial=-1 extra_right_context_final=-1 online_ivector_dir= graphs_scp= +write_best_path_alignments=false # End configuration options. echo "$0 $@" # Print the command line for logging @@ -166,4 +167,11 @@ if [ $stage -le 1 ]; then "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; fi +if [ $stage -le 2 ] && $write_best_path_alignments; then + $cmd JOB=1:$nj $dir/log/best_path.JOB.log \ + lattice-best-path --acoustic-scale=$acoustic_scale \ + "ark:gunzip -c $dir/lat.JOB.gz |" ark:/dev/null \ + "ark:|gzip -c > $dir/ali.JOB.gz" || exit 1; +fi + echo "$0: done aligning data." diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index d1c7d24d829..22fd03baa76 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -37,7 +37,7 @@ minimize=false echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. -. parse_options.sh || exit 1; +. utils/parse_options.sh || exit 1; if [ $# -ne 3 ]; then echo "Usage: $0 [options] " diff --git a/egs/wsj/s5/steps/nnet3/get_degs.sh b/egs/wsj/s5/steps/nnet3/get_degs.sh index 44a7886fc3f..63271e3adaa 100755 --- a/egs/wsj/s5/steps/nnet3/get_degs.sh +++ b/egs/wsj/s5/steps/nnet3/get_degs.sh @@ -143,9 +143,11 @@ echo "$0: feature type is raw" cmvn_opts=$(cat $srcdir/cmvn_opts) || exit 1 + feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" -cp $srcdir/{splice_opts,cmvn_opts} $dir 2>/dev/null || true +cp $srcdir/{tree,cmvn_opts} $dir || exit 1 +cp $srcdir/splice_opts $dir 2>/dev/null || true if [ ! -z "$transform_dir" ]; then echo "$0: using transforms from $transform_dir" @@ -332,7 +334,7 @@ fi # set the command to determinize lattices, if specified. 
if $determinize_before_split; then - lattice_determinize_cmd="lattice-determinize-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=true --prune=true --beam=$lattice_beam ark:- ark:-" + lattice_determinize_cmd="lattice-determinize-pruned-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=true --beam=$lattice_beam ark:- ark:-" else lattice_determinize_cmd="cat" fi diff --git a/egs/wsj/s5/steps/nnet3/train_discriminative.sh b/egs/wsj/s5/steps/nnet3/train_discriminative.sh index f779c2041cb..4bd6f6b9456 100755 --- a/egs/wsj/s5/steps/nnet3/train_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/train_discriminative.sh @@ -54,6 +54,7 @@ cleanup=true keep_model_iters=100 remove_egs=false src_model= # will default to $degs_dir/final.mdl +adjust_priors=true # Set it to false for 'chain' models num_jobs_compute_prior=10 @@ -330,18 +331,19 @@ while [ $x -lt $num_iters ]; do e=${iter_to_epoch[$x]} ln -sf $x.mdl $dir/epoch$e.mdl - ( - rm $dir/.error 2> /dev/null - - steps/nnet3/adjust_priors.sh --egs-type degs \ - --num-jobs-compute-prior $num_jobs_compute_prior \ - --cmd "$cmd" --use-gpu false \ - --minibatch-size $minibatch_size \ - --use-raw-nnet false --iter epoch$e $dir $degs_dir \ - || { touch $dir/.error; echo "Error in adjusting priors. See errors above."; exit 1; } - ) & + if $adjust_priors; then + ( + rm $dir/.error 2> /dev/null + + steps/nnet3/adjust_priors.sh --egs-type degs \ + --num-jobs-compute-prior $num_jobs_compute_prior \ + --cmd "$cmd" --use-gpu false \ + --minibatch-size $minibatch_size \ + --use-raw-nnet false --iter epoch$e $dir $degs_dir \ + || { touch $dir/.error; echo "Error in adjusting priors. See errors above."; exit 1; } + ) & + fi fi - done rm $dir/final.mdl 2>/dev/null diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index 3c81ec12b95..87eaebfd45e 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -317,7 +317,7 @@ def main(): aux_layers = [] if args.existing_model is not None: aux_layers = xparser.get_model_component_info(args.existing_model) - all_layers = xparser.read_xconfig_file(args.xconfig_file, aux_layers) + all_layers = xparser.read_xconfig_file(args.xconfig_file, aux_layers) write_expanded_xconfig_files(args.config_dir, all_layers) write_config_files(args.config_dir, all_layers) check_model_contexts(args.config_dir, args.nnet_edits, diff --git a/egs/wsj/s5/steps/scoring/score_kaldi_wer.sh b/egs/wsj/s5/steps/scoring/score_kaldi_wer.sh index 9988c941441..9393f5616c5 100755 --- a/egs/wsj/s5/steps/scoring/score_kaldi_wer.sh +++ b/egs/wsj/s5/steps/scoring/score_kaldi_wer.sh @@ -16,6 +16,7 @@ word_ins_penalty=0.0,0.5,1.0 min_lmwt=7 max_lmwt=17 iter=final +scoring_affix=_kaldi #end configuration section. 
echo "$0 $@" # Print the command line for logging @@ -59,15 +60,15 @@ else fi -mkdir -p $dir/scoring_kaldi -cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; +mkdir -p $dir/scoring${scoring_affix} +cat $data/text | $ref_filtering_cmd > $dir/scoring${scoring_affix}/test_filt.txt || exit 1; if [ $stage -le 0 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do - mkdir -p $dir/scoring_kaldi/penalty_$wip/log + mkdir -p $dir/scoring${scoring_affix}/penalty_$wip/log if $decode_mbr ; then - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring${scoring_affix}/penalty_$wip/log/best_path.LMWT.log \ acwt=\`perl -e \"print 1.0/LMWT\"\`\; \ lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ @@ -75,21 +76,21 @@ if [ $stage -le 0 ]; then lattice-mbr-decode --word-symbol-table=$symtab \ ark:- ark,t:- \| \ utils/int2sym.pl -f 2- $symtab \| \ - $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + $hyp_filtering_cmd '>' $dir/scoring${scoring_affix}/penalty_$wip/LMWT.txt || exit 1; else - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring${scoring_affix}/penalty_$wip/log/best_path.LMWT.log \ lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \ utils/int2sym.pl -f 2- $symtab \| \ - $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + $hyp_filtering_cmd '>' $dir/scoring${scoring_affix}/penalty_$wip/LMWT.txt || exit 1; fi - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \ - cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \ + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring${scoring_affix}/penalty_$wip/log/score.LMWT.log \ + cat $dir/scoring${scoring_affix}/penalty_$wip/LMWT.txt \| \ compute-wer --text --mode=present \ - ark:$dir/scoring_kaldi/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; + ark:$dir/scoring${scoring_affix}/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; done fi @@ -103,9 +104,9 @@ if [ $stage -le 1 ]; then # adding /dev/null to the command list below forces grep to output the filename grep WER $dir/wer_${lmwt}_${wip} /dev/null done - done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1 + done | utils/best_wer.sh >& $dir/scoring${scoring_affix}/best_wer || exit 1 - best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer) + best_wer_file=$(awk '{print $NF}' $dir/scoring${scoring_affix}/best_wer) best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}') best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}') @@ -115,25 +116,25 @@ if [ $stage -le 1 ]; then fi if $stats; then - mkdir -p $dir/scoring_kaldi/wer_details - echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight - echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty - - $cmd $dir/scoring_kaldi/log/stats1.log \ - cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ - align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ - utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\ - 
utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1; - - $cmd $dir/scoring_kaldi/log/stats2.log \ - cat $dir/scoring_kaldi/wer_details/per_utt \| \ + mkdir -p $dir/scoring${scoring_affix}/wer_details + echo $best_lmwt > $dir/scoring${scoring_affix}/wer_details/lmwt # record best language model weight + echo $best_wip > $dir/scoring${scoring_affix}/wer_details/wip # record best word insertion penalty + + $cmd $dir/scoring${scoring_affix}/log/stats1.log \ + cat $dir/scoring${scoring_affix}/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$dir/scoring${scoring_affix}/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring${scoring_affix}/wer_details/per_utt \|\ + utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring${scoring_affix}/wer_details/per_spk || exit 1; + + $cmd $dir/scoring${scoring_affix}/log/stats2.log \ + cat $dir/scoring${scoring_affix}/wer_details/per_utt \| \ utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ - sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; + sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring${scoring_affix}/wer_details/ops || exit 1; - $cmd $dir/scoring_kaldi/log/wer_bootci.log \ + $cmd $dir/scoring${scoring_affix}/log/wer_bootci.log \ compute-wer-bootci --mode=present \ - ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ - '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; + ark:$dir/scoring${scoring_affix}/test_filt.txt ark:$dir/scoring${scoring_affix}/penalty_$best_wip/$best_lmwt.txt \ + '>' $dir/scoring${scoring_affix}/wer_details/wer_bootci || exit 1; fi fi diff --git a/egs/wsj/s5/steps/tfrnnlm/vanilla_rnnlm.py b/egs/wsj/s5/steps/tfrnnlm/vanilla_rnnlm.py old mode 100644 new mode 100755 index de263c6923f..48d081058c4 --- a/egs/wsj/s5/steps/tfrnnlm/vanilla_rnnlm.py +++ b/egs/wsj/s5/steps/tfrnnlm/vanilla_rnnlm.py @@ -14,7 +14,7 @@ # limitations under the License. # ============================================================================== -# this script trains a vanilla RNNLM with TensorFlow. +# this script trains a vanilla RNNLM with TensorFlow. 
# to call the script, do # python steps/tfrnnlm/vanilla_rnnlm.py --data-path=$datadir \ # --save-path=$savepath --vocab-path=$rnn.wordlist [--hidden-size=$size] @@ -39,6 +39,7 @@ logging = tf.logging flags.DEFINE_integer("hidden-size", 200, "hidden dim of RNN") +flags.DEFINE_integer("max-max-epoch", 20, "maximum number of epochs") flags.DEFINE_string("data-path", None, "Where the training/test data is stored.") @@ -120,7 +121,7 @@ def attn_cell(): test_word_in = tf.placeholder(tf.int32, [1, 1], name="test_word_in") state_placeholder = tf.placeholder(tf.float32, [config.num_layers, 1, size], name="test_state_in") - # unpacking the input state context + # unpacking the input state context l = tf.unstack(state_placeholder, axis=0) test_input_state = tuple( [l[idx] for idx in range(config.num_layers)] @@ -281,6 +282,7 @@ def main(_): config = get_config() config.hidden_size = FLAGS.hidden_size + config.max_max_epoch = FLAGS.max_max_epoch config.vocab_size = len(word_map) eval_config = get_config() eval_config.batch_size = 1 diff --git a/egs/wsj/s5/utils/mkgraph.sh b/egs/wsj/s5/utils/mkgraph.sh index 1becfc45be3..3d06436e0a3 100755 --- a/egs/wsj/s5/utils/mkgraph.sh +++ b/egs/wsj/s5/utils/mkgraph.sh @@ -19,13 +19,15 @@ tscale=1.0 loopscale=0.1 remove_oov=false +unk_prob_scale=1.0 -for x in `seq 4`; do +for x in `seq 5`; do [ "$1" == "--mono" -o "$1" == "--left-biphone" -o "$1" == "--quinphone" ] && shift && \ echo "WARNING: the --mono, --left-biphone and --quinphone options are now deprecated and ignored." [ "$1" == "--remove-oov" ] && remove_oov=true && shift; [ "$1" == "--transition-scale" ] && tscale=$2 && shift 2; [ "$1" == "--self-loop-scale" ] && loopscale=$2 && shift 2; + [ "$1" == "--unk-prob-scale" ] && unk_prob_scale=$2 && shift 2; done if [ $# != 3 ]; then @@ -79,12 +81,26 @@ P=$(tree-info $tree | grep "central-position" | cut -d' ' -f2) || { echo "Error echo "$0: WARNING: chain models need '--self-loop-scale 1.0'"; mkdir -p $lang/tmp + +G_fst=$lang/G.fst +if [ $unk_prob_scale != 1.0 ]; then + oov_symbol=`cat $lang/oov.int` + fstprint $lang/G.fst | \ + awk -v oov_symbol=$oov_symbol -v unk_scale=$unk_prob_scale '{ + if ($4 == oov_symbol) { + $5 = $5 - log(unk_scale); + } + print $0; + }' | fstcompile > $graph_dir/G_tmp.fst + G_fst=$graph_dir/G_tmp.fst +fi + trap "rm -f $lang/tmp/LG.fst.$$" EXIT HUP INT PIPE TERM # Note: [[ ]] is like [ ] but enables certain extra constructs, e.g. || in # place of -o -if [[ ! -s $lang/tmp/LG.fst || $lang/tmp/LG.fst -ot $lang/G.fst || \ +if [[ ! 
-s $lang/tmp/LG.fst || $lang/tmp/LG.fst -ot $G_fst || \ $lang/tmp/LG.fst -ot $lang/L_disambig.fst ]]; then - fsttablecompose $lang/L_disambig.fst $lang/G.fst | fstdeterminizestar --use-log=true | \ + fsttablecompose $lang/L_disambig.fst $G_fst | fstdeterminizestar --use-log=true | \ fstminimizeencoded | fstpushspecial | \ fstarcsort --sort_type=ilabel > $lang/tmp/LG.fst.$$ || exit 1; mv $lang/tmp/LG.fst.$$ $lang/tmp/LG.fst diff --git a/src/latbin/Makefile b/src/latbin/Makefile index 43210c0d3e0..3885db0af71 100644 --- a/src/latbin/Makefile +++ b/src/latbin/Makefile @@ -21,7 +21,8 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \ lattice-confidence lattice-determinize-phone-pruned \ lattice-determinize-phone-pruned-parallel lattice-expand-ngram \ lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons \ - lattice-arc-post lattice-determinize-non-compact + lattice-arc-post lattice-determinize-non-compact \ + lattice-determinize-pruned-non-compact OBJFILES = diff --git a/src/tfrnnlm/Makefile b/src/tfrnnlm/Makefile index 74c27725c93..6550d9bdc26 100644 --- a/src/tfrnnlm/Makefile +++ b/src/tfrnnlm/Makefile @@ -27,7 +27,7 @@ TESTFILES = LIBNAME = kaldi-tensorflow-rnnlm -ADDLIBS = ../lm/kaldi-lm.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ +ADDLIBS = ../lm/kaldi-lm.a ../util/kaldi-util.a \ ../matrix/kaldi-matrix.a ../base/kaldi-base.a \ LDLIBS += -lz -ldl -fPIC -lrt From af050b6a228d988f6a168564c291c2c589a1d203 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 1 Sep 2017 12:41:37 -0400 Subject: [PATCH 057/174] semisup: Add more recipes --- .../run_tdnn_11k_semisupervised_conf_h.sh | 385 ++++++++++++++ .../run_tdnn_11k_semisupervised_conf_i.sh | 381 ++++++++++++++ .../run_tdnn_11k_semisupervised_conf_j.sh | 465 +++++++++++++++++ .../run_tdnn_11k_semisupervised_conf_k.sh | 406 +++++++++++++++ .../run_tdnn_11k_semisupervised_conf_l.sh | 478 ++++++++++++++++++ .../s5/local/semisup/run_20k.sh | 64 +++ .../lattice-determinize-pruned-non-compact.cc | 259 ++++++++++ 7 files changed, 2438 insertions(+) create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_h.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_i.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_j.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_k.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_l.sh create mode 100644 egs/fisher_english/s5/local/semisup/run_20k.sh create mode 100644 src/latbin/lattice-determinize-pruned-non-compact.cc diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_h.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_h.sh new file mode 100644 index 00000000000..492f992c6f0 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_h.sh @@ -0,0 +1,385 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1h # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +rnnlm_weight=0.5 +rnnlm_dir=data/tf_fast_lstm_ex250k +rnnlm_affix=unk.fast.tfrnnlm + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix:+${tree_affix}_}${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_i.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_i.sh new file mode 100644 index 00000000000..6afb3e2276f --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_i.sh @@ -0,0 +1,381 @@ +#!/bin/bash + +# This script is same as _f, but uses a separate silence tolerance. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +sil_tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1i # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}_sil${sil_tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-tolerance-silence $sil_tolerance --right-tolerance-silence $sil_tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_j.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_j.sh new file mode 100644 index 00000000000..c3c0db77856 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_j.sh @@ -0,0 +1,465 @@ +#!/bin/bash + +# This script is same as _f, but uses UNK model. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram with UNK model + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +sil_tolerance= +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1j # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}${sil_tolerance:+_sil$sil_tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + ${sil_tolerance:+--left-tolerance-silence $sil_tolerance --right-tolerance-silence $sil_tolerance} \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_k.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_k.sh new file mode 100644 index 00000000000..8f83f1a3529 --- /dev/null +++ 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_k.sh @@ -0,0 +1,406 @@ +#!/bin/bash + +# This script is same as _f, but uses UNK model. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram with UNK model + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +sil_tolerance= +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1k # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if [ $# -ne 0 ]; then + echo "Usage: $0" + exit 1 +fi + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}${sil_tolerance:+_sil$sil_tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
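+  # For instance, with the xent_regularize=0.1 configured at the top of this
+  # script, learning_rate_factor = 0.5 / 0.1 = 5.0, so the xent output layers
+  # below are given 5 times the base learning rate to offset the 0.1 scale on
+  # their objective.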
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
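+
+    # The egs options below differ from the supervised branch: the lattices come
+    # from a chain-model decode, so --alignment-subsampling-factor is set to 1
+    # (no further subsampling of the supervision); --lattice-lm-scale and
+    # --lattice-prune-beam control how the lattice LM scores are weighted and
+    # the lattices pruned before being converted to supervision; and
+    # --deriv-weights-scp supplies per-frame derivative weights taken from the
+    # best-path pdf posteriors, so low-confidence frames contribute less to the
+    # parameter updates.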
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + ${sil_tolerance:+--left-tolerance-silence $sil_tolerance --right-tolerance-silence $sil_tolerance} \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${lang_test_suffix}${unk_prob_scale:+=_unkscale$unk_prob_scale} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${unk_prob_scale:+--unk-prob-scale $unk_prob_scale} \ + data/lang_test${lang_test_suffix} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_l.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_l.sh new file mode 100644 index 00000000000..c2772704943 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_l.sh @@ -0,0 +1,478 @@ +#!/bin/bash + +# This script is same as _f, but uses UNK model. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram with UNK model + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +sil_tolerance= +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1l # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}${sil_tolerance:+_sil$sil_tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/subset_ali_dir.sh --cmd "$train_cmd" \ + data/${supervised_set} data/${supervised_set}_sp_hires \ + ${chaindir}_lats_${supervised_set}_sp \ + ${chaindir}_best_path_${supervised_set} || exit 1 + echo $frame_subsampling_factor > ${chaindir}_best_path_${supervised_set}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} ${chaindir}_best_path_${supervised_set} \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + ${sil_tolerance:+--left-tolerance-silence $sil_tolerance --right-tolerance-silence $sil_tolerance} \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + ${sil_tolerance:+--left-tolerance-silence $sil_tolerance --right-tolerance-silence $sil_tolerance} \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/run_20k.sh b/egs/fisher_english/s5/local/semisup/run_20k.sh new file mode 100644 index 00000000000..9af463cd7a2 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/run_20k.sh @@ -0,0 +1,64 @@ +#!/bin/bash + 
+# Copyright 2017 Vimal Manohar +# Apache 2.0 + +. cmd.sh +. path.sh + +stage=-1 +train_stage=-10 + +. utils/parse_options.sh + +set -o pipefail +exp=exp/semisup_20k + +utils/subset_data_dir.sh --speakers data/train_sup 20000 data/train_sup20k || exit 1 +utils/subset_data_dir.sh --shortest data/train_sup20k 5000 data/train_sup20k_short || exit 1 +utils/subset_data_dir.sh data/train_sup20k 10000 data/train_sup20k_half || exit 1 + +steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup20k_short data/lang $exp/mono0a || exit 1 + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup20k_half data/lang $exp/mono0a $exp/mono0a_ali || exit 1 + +steps/train_deltas.sh --cmd "$train_cmd" \ + 2000 10000 data/train_sup20k_half data/lang $exp/mono0a_ali $exp/tri1 || exit 1 + +(utils/mkgraph.sh data/lang_test $exp/tri1 $exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri1/graph data/dev $exp/tri1/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup20k data/lang $exp/tri1 $exp/tri1_ali || exit 1; + +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup20k data/lang $exp/tri1_ali $exp/tri2 || exit 1; + +(utils/mkgraph.sh data/lang_test $exp/tri2 $exp/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri2/graph data/dev $exp/tri2/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup20k data/lang $exp/tri2 $exp/tri2_ali || exit 1; + +steps/train_sat.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup20k data/lang $exp/tri2_ali $exp/tri3 || exit 1; + +( + utils/mkgraph.sh data/lang_test $exp/tri3 $exp/tri3/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri3/graph data/dev $exp/tri3/decode_dev +)& + +utils/combine_data.sh data/semisup20k_250k data/train_sup20k data/train_unsup250k || exit 1 + +local/semisup/chain/tuning/run_tdnn_11k.sh \ + --train-set train_sup20k \ + --nnet3-affix _semi20k_250k \ + --chain-affix _semi20k_250k \ + --stage $stage --train-stage $train_stage \ + --exp $exp \ + --ivector-train-set semisup20k_250k || exit 1 diff --git a/src/latbin/lattice-determinize-pruned-non-compact.cc b/src/latbin/lattice-determinize-pruned-non-compact.cc new file mode 100644 index 00000000000..bd70032a3c0 --- /dev/null +++ b/src/latbin/lattice-determinize-pruned-non-compact.cc @@ -0,0 +1,259 @@ +// latbin/lattice-determinize-pruned-non-compact.cc + +// Copyright 2013 Daniel Povey (Johns Hopkins University) +// 2017 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
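+
+// This program is a variant of lattice-determinize-pruned whose output is a
+// non-compact Lattice with frame-level acoustic costs preserved.  Before
+// determinization it records, for each (frame, transition-id) pair, the sum
+// and count of the acoustic scores seen on arcs (ComputeAcousticScoresMap
+// below); after determinizing and converting back to a non-compact lattice,
+// it writes the averaged acoustic score back onto each arc
+// (ReplaceAcousticScoresFromMap).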
+#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "lat/kaldi-lattice.h" +#include "lat/determinize-lattice-pruned.h" +#include "lat/lattice-functions.h" +#include "lat/push-lattice.h" +#include "lat/minimize-lattice.h" + +namespace kaldi { + +typedef Lattice::StateId StateId; +typedef Lattice::Arc Arc; + +void ComputeAcousticScoresMap( + const Lattice &lat, + unordered_map, std::pair, + PairHasher > *acoustic_scores) { + acoustic_scores->clear(); + + std::vector state_times; + LatticeStateTimes(lat, &state_times); + + KALDI_ASSERT(lat.Start() == 0); + + for (StateId s = 0; s < lat.NumStates(); s++) { + int32 t = state_times[s]; + for (fst::ArcIterator aiter(lat, s); !aiter.Done(); + aiter.Next()) { + const Arc &arc = aiter.Value(); + const LatticeWeight &weight = arc.weight; + + int32 tid = arc.ilabel; + + if (tid != 0) { + unordered_map, std::pair, + PairHasher >::iterator it = acoustic_scores->find(std::make_pair(t, tid)); + if (it == acoustic_scores->end()) { + acoustic_scores->insert(std::make_pair(std::make_pair(t, tid), + std::make_pair(weight.Value2(), 1))); + } else { + if (it->second.second == 2 + && it->second.first / it->second.second != weight.Value2()) { + KALDI_VLOG(2) << "Transitions on the same frame have different " + << "acoustic costs for tid " << tid << "; " + << it->second.first / it->second.second + << " vs " << weight.Value2(); + } + it->second.first += weight.Value2(); + it->second.second++; + } + } else { + // Arcs with epsilon input label (tid) must have 0 acoustic cost + KALDI_ASSERT(weight.Value2() == 0); + } + } + + LatticeWeight f = lat.Final(s); + if (f != LatticeWeight::Zero()) { + // Final acoustic cost must be 0 as we are reading from + // non-determinized, non-compact lattice + KALDI_ASSERT(f.Value2() == 0.0); + } + } +} + +void ReplaceAcousticScoresFromMap( + const unordered_map, std::pair, + PairHasher > &acoustic_scores, + Lattice *lat) { + fst::TopSort(lat); + + std::vector state_times; + LatticeStateTimes(*lat, &state_times); + + KALDI_ASSERT(lat->Start() == 0); + + for (StateId s = 0; s < lat->NumStates(); s++) { + int32 t = state_times[s]; + for (fst::MutableArcIterator aiter(lat, s); + !aiter.Done(); aiter.Next()) { + Arc arc(aiter.Value()); + + int32 tid = arc.ilabel; + if (tid != 0) { + unordered_map, std::pair, + PairHasher >::const_iterator it = acoustic_scores.find(std::make_pair(t, tid)); + if (it == acoustic_scores.end()) { + KALDI_ERR << "Could not find tid " << tid << " at time " << t + << " in the acoustic scores map."; + } else { + arc.weight.SetValue2(it->second.first / it->second.second); + } + } else { + // For epsilon arcs, set acoustic cost to 0.0 + arc.weight.SetValue2(0.0); + } + aiter.SetValue(arc); + } + + LatticeWeight f = lat->Final(s); + if (f != LatticeWeight::Zero()) { + // Set final acoustic cost to 0.0 + f.SetValue2(0.0); + lat->SetFinal(s, f); + } + } +} + +} // end namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + typedef kaldi::int32 int32; + + const char *usage = + "Determinize lattices, keeping only the best path (sequence of acoustic states)\n" + "for each input-symbol sequence. 
This version does pruning as part of the\n" + "determinization algorithm, which is more efficient and prevents blowup.\n" + "See http://kaldi-asr.org/doc/lattices.html for more information on lattices.\n" + "\n" + "Usage: lattice-determinize-pruned [options] lattice-rspecifier lattice-wspecifier\n" + " e.g.: lattice-determinize-pruned --acoustic-scale=0.1 --beam=6.0 ark:in.lats ark:det.lats\n"; + + ParseOptions po(usage); + BaseFloat acoustic_scale = 1.0; + BaseFloat beam = 10.0; + bool minimize = false; + fst::DeterminizeLatticePrunedOptions opts; // Options used in DeterminizeLatticePruned-- + // this options class does not have its own Register function as it's viewed as + // being more part of "fst world", so we register its elements independently. + opts.max_mem = 50000000; + opts.max_loop = 0; // was 500000; + + po.Register("acoustic-scale", &acoustic_scale, + "Scaling factor for acoustic likelihoods"); + po.Register("beam", &beam, "Pruning beam [applied after acoustic scaling]."); + po.Register("minimize", &minimize, + "If true, push and minimize after determinization"); + opts.Register(&po); + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string lats_rspecifier = po.GetArg(1), + lats_wspecifier = po.GetArg(2); + + + // Read as regular lattice-- this is the form the determinization code + // accepts. + SequentialLatticeReader lat_reader(lats_rspecifier); + + // Write as compact lattice. + LatticeWriter lat_writer(lats_wspecifier); + + int32 n_done = 0, n_warn = 0; + + // depth stats (for diagnostics). + double sum_depth_in = 0.0, + sum_depth_out = 0.0, sum_t = 0.0; + + if (acoustic_scale == 0.0) + KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)"; + + for (; !lat_reader.Done(); lat_reader.Next()) { + std::string key = lat_reader.Key(); + Lattice lat = lat_reader.Value(); + + KALDI_VLOG(2) << "Processing lattice " << key; + + fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &lat); + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + ComputeAcousticScoresMap(lat, &acoustic_scores); + + Invert(&lat); // so word labels are on the input side. + lat_reader.FreeCurrent(); + if (!TopSort(&lat)) { + KALDI_WARN << "Could not topologically sort lattice: this probably means it" + " has bad properties e.g. epsilon cycles. Your LM or lexicon might " + "be broken, e.g. 
LM with epsilon cycles or lexicon with empty words."; + } + fst::ArcSort(&lat, fst::ILabelCompare()); + CompactLattice det_clat; + if (!DeterminizeLatticePruned(lat, beam, &det_clat, opts)) { + KALDI_WARN << "For key " << key << ", determinization did not succeed" + "(partial output will be pruned tighter than the specified beam.)"; + n_warn++; + } + fst::Connect(&det_clat); + if (det_clat.NumStates() == 0) { + KALDI_WARN << "For key " << key << ", determinized and trimmed lattice " + "was empty."; + n_warn++; + } + if (minimize) { + PushCompactLatticeStrings(&det_clat); + PushCompactLatticeWeights(&det_clat); + MinimizeCompactLattice(&det_clat); + } + + int32 t; + TopSortCompactLatticeIfNeeded(&det_clat); + double depth = CompactLatticeDepth(det_clat, &t); + sum_depth_in += lat.NumStates(); + sum_depth_out += depth * t; + sum_t += t; + + Lattice out_lat; + fst::ConvertLattice(det_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &out_lat); + lat_writer.Write(key, out_lat); + n_done++; + } + + if (sum_t != 0.0) { + KALDI_LOG << "Average input-lattice depth (measured at at state level) is " + << (sum_depth_in / sum_t) << ", output depth is " + << (sum_depth_out / sum_t) << ", over " << sum_t << " frames " + << " (average num-frames = " << (sum_t / n_done) << ")."; + } + KALDI_LOG << "Done " << n_done << " lattices, determinization finished " + << "earlier than specified by the beam (or output was empty) on " + << n_warn << " of these."; + return (n_done != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} From 82daf84dd16a6dee10c912ec6608bdb97faf2543 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sat, 2 Sep 2017 17:58:38 -0400 Subject: [PATCH 058/174] Update xconfig_to_configs.py --- egs/wsj/s5/steps/nnet3/xconfig_to_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index fa9cd317331..bbd4589620b 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -319,7 +319,7 @@ def main(): existing_layers = [] if args.existing_model is not None: existing_layers = xparser.get_model_component_info(args.existing_model) - all_layers = xparser.read_xconfig_file(args.xconfig_file, existing_layers) + all_layers = xparser.read_xconfig_file(args.xconfig_file, existing_layers) write_expanded_xconfig_files(args.config_dir, all_layers) write_config_files(args.config_dir, all_layers) check_model_contexts(args.config_dir, args.nnet_edits, From ed63b190b49cb209cb179e4722ae374df3c2e981 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sun, 3 Sep 2017 19:45:06 -0400 Subject: [PATCH 059/174] Update make_weighted_den_fst.sh --- egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index 5e48acb914b..8d989f190e7 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -36,7 +36,7 @@ weights= # comma-separated list of positive int valued scale weights used # i.e. "1,10" # If not specified, weight '1' is used for all phone sequences. 
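                # For example, --weights 3,1 would count phone sequences from
                # the first alignment directory three times and those from the
                # second once when estimating the phone LM.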
-lm_opts='num_extra_lm_state=2000' +lm_opts='--num-extra-lm-states=2000' #end configuration section. From 125abf03fd9311d3c016e12bf54f856a9afe65f2 Mon Sep 17 00:00:00 2001 From: Pegita Date: Tue, 5 Sep 2017 20:37:32 -0400 Subject: [PATCH 060/174] fixed small issues. --- .../local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 2 +- .../local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 2 +- .../steps/libs/nnet3/xconfig/basic_layers.py | 25 ++++++++++--------- .../nnet3/chain/make_weighted_den_fst.sh | 13 +++++++--- src/nnet3/nnet-utils.cc | 4 +-- src/nnet3/nnet-utils.h | 6 ++--- 6 files changed, 29 insertions(+), 23 deletions(-) diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index 40e026b0552..070337c4546 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -113,7 +113,7 @@ if [ ! -z $src_ivec_extractor_dir ]; then else if [ $ivector_dim -gt 0 ]; then echo "$0: ivector is used in training the source model '$src_mdl' but no " - echo " ivector extractor dir for source model is specified." && exit 1; + echo " --src-ivec-extractor-dir option as ivector dir for source model is specified." && exit 1; fi fi diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index 8528d8afb37..6ded9917851 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -110,7 +110,7 @@ if [ ! -z $src_ivec_extractor_dir ]; then else if [ $ivector_dim -gt 0 ]; then echo "$0: ivector is used in training the source model '$src_mdl' but no " - echo " ivector extractor dir for source model is specified." && exit 1; + echo " --src-ivec-extractor-dir option as ivector dir for source model is specified." && exit 1; fi fi diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 10d8af6385b..80816ad0bb4 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -42,13 +42,13 @@ def __init__(self, first_token, key_to_value, all_layers): raise RuntimeError("Invalid value: name={0}".format( key_to_value['name'])) - # It is possible to have two layers with a same name in 'all_layer', if + # It is possible to have two layers with a same name in 'all_layer', if # the layer type for one of them is 'existing'. # Layers of type 'existing' are corresponding to the component-node names # in the existing model, which we are adding layers to them. # 'existing' layers are not presented in any config file, and new layer # with the same name can exist in 'all_layers'. - # e.g. It is possible to have 'output-node' with name 'output' in the + # e.g. It is possible to have 'output-node' with name 'output' in the # existing model, which is added to all_layers using layer type 'existing', # and 'output-node' of type 'output-layer' with the same name 'output' in # 'all_layers'. @@ -1089,19 +1089,20 @@ def get_full_config(self): class XconfigExistingLayer(XconfigLayerBase): """ - This class is for lines like + This class is used to internally convert component-nodes in an existing + model into lines like 'existing name=tdnn1.affine dim=40'. - - This layer contains 'dim' and 'name' and it is not presented in - any actual config files. - Layers of this type are created internally for all component nodes - in an existing neural net model to use as input to other layers. 
+ + Layers of this type are not presented in any actual xconfig or config + files, but are created internally for all component nodes + in an existing neural net model to use as input to other layers in xconfig. (i.e. get_model_component_info function, which is called in - steps/nnet3/xconfig_to_configs.py, returns a list of 'existing' - layers for component nodes used in 'existing_model') - + steps/nnet3/xconfig_to_configs.py, parses the name and + dimension of component-nodes used in the existing model + using the nnet3-info and returns a list of 'existing' layers.) + This class is useful in cases like transferring existing model - and using {input, output, component}-nodes in this model as + and using {input, output, component}-nodes in this model as input to new layers. """ diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index 8d989f190e7..94629131608 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -49,6 +49,10 @@ if [ $# -lt 2 ]; then echo "Options: " echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."; echo "--lm-opts # options for phone LM generation"; + echo "--weights # comma-separated list of positive int " + echo " # weights used to scale different phone sequences" + echo " # corresponding to different alignment " + echo " # in phone LM generation."; exit 1; fi @@ -75,7 +79,8 @@ cp ${ali_dirs[0]}/tree $dir/ || exit 1 if [ -z $weights ]; then # If 'weights' is not specified, comma-separated array '1' with dim #'num_alignments' is defined as 'weights'. - for n in `seq 1 $num_alignments`;do weights="$weights,1"; done + w_arr=() + for n in `seq 1 $num_alignments`;do w_arr+=(1); done else w_arr=(${weights//,/ }) num_weights=${#w_arr[@]} @@ -87,7 +92,7 @@ fi if [ $stage -le 1 ]; then for n in `seq 0 $[num_alignments-1]`; do - w=$(echo $weights | cut -d, -f$[$n+1]) + w=${w_arr[$n]} adir=${ali_dirs[$n]} num_jobs=$(cat $adir/num_jobs) if ! [[ $w =~ ^[+]?[0-9]+$ ]] ; then @@ -96,7 +101,7 @@ if [ $stage -le 1 ]; then rm $adir/alignment_files.txt 2>/dev/null || true for x in `seq $w`;do for j in `seq $num_jobs`;do - echo $adir/ali.$j.gz >> $adir/alignment_files.txt + echo $adir/ali.$j.gz >> $dir/alignment_files.${n}.txt done done done @@ -104,7 +109,7 @@ if [ $stage -le 1 ]; then ali_dirs=\(${ali_dirs[@]}\) \; \ for n in `seq 0 $[num_alignments-1]`\; do \ adir=\${ali_dirs[\$n]} \; \ - cat \$adir/alignment_files.txt \| while read f\; do gunzip -c \$f \; done \| \ + cat $dir/alignment_files.\$n.txt \| while read f\; do gunzip -c \$f \; done \| \ ali-to-phones \$adir/final.mdl ark:- ark:- \; \ done \| \ chain-est-phone-lm $lm_opts ark:- $dir/phone_lm.fst || exit 1; diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 22bb60e0754..9bad40f6fe8 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -653,7 +653,7 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) { num_learning_rates_set++; } } - KALDI_LOG << "Set learning rates for " << num_learning_rates_set << " nodes."; + KALDI_LOG << "Set learning rates for " << num_learning_rates_set << " components."; } else if (directive == "set-learning-rate-factor") { std::string name_pattern = "*"; // name_pattern defaults to '*' if none is given. 
@@ -679,7 +679,7 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) { } } KALDI_LOG << "Set learning rate factors for " << num_learning_rate_factors_set - << " nodes."; + << " components."; } else if (directive == "rename-node") { // this is a shallow renaming of a node, and it requires that the name used is // not the name of another node. diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 47374e158a4..c8c371b2da8 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -279,12 +279,12 @@ void CollapseModel(const CollapseModelConfig &config, The same as calling remove-orphan-nodes and then remove-orphan-components. set-learning-rate [name=] learning-rate= - Sets the learning rate for any updatable nodes matching the name pattern. + Sets the learning rate for any updatable components matching the name pattern. Note: this sets the 'underlying' learning rate, i.e. it will get - multiplied by any 'learning-rate-factor' set in the nodes. + multiplied by any 'learning-rate-factor' set in the components. set-learning-rate-factor [name=] learning-rate-factor= - Sets the learning rate factor for any updatable nodes matching the name pattern. + Sets the learning rate factor for any updatable components matching the name pattern. rename-node old-name= new-name= Renames a node; this is a surface renaming that does not affect the structure From f51492b48f0d4da6dc3465cacab258fdbd82aed4 Mon Sep 17 00:00:00 2001 From: Pegita Date: Tue, 5 Sep 2017 20:44:00 -0400 Subject: [PATCH 061/174] fixed small issue. --- egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index 6ded9917851..af43b4b0491 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -162,7 +162,8 @@ fi if [ $stage -le 6 ]; then echo "$0: compute {den,normalization}.fst using weighted phone LM." - steps/nnet3/chain/make_weighted_den_fst.sh --weights $phone_lm_scales \ + steps/nnet3/chain/make_weighted_den_fst.sh --cmd "$train_cmd" \ + --weights $phone_lm_scales \ --lm-opts '--num-extra-lm-states=200' \ $src_tree_dir $lat_dir $dir || exit 1; fi From ba308eac42dd5d80a4dec0d1a29d918311579dba Mon Sep 17 00:00:00 2001 From: Pegita Date: Sun, 10 Sep 2017 11:07:25 -0400 Subject: [PATCH 062/174] modified make_weighted_den_fst.sh --- egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index af43b4b0491..38ee53bf19f 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -13,8 +13,8 @@ # Since we use phone.txt from source dataset, this can be helpful in cases # where there is a few training data in the target domain and some 4-gram phone # sequences have no count in the target domain. -# 4) It transfers all layers in already-trained model and -# re-train the last layer using target dataset, instead of replacing it +# 4) It transfers all layers in already-trained model and +# re-train the last layer using target dataset, instead of replacing it # with new randomely initialized output layer. 
# This script uses weight transfer as Transfer learning method From 8fae871c3079e7f84d6cc96aa61d85e5169eced3 Mon Sep 17 00:00:00 2001 From: Pegita Date: Sun, 10 Sep 2017 17:38:44 -0400 Subject: [PATCH 063/174] modified weighted_den_fst.sh --- .../steps/nnet3/chain/make_weighted_den_fst.sh | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index 94629131608..947f6175469 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -98,21 +98,17 @@ if [ $stage -le 1 ]; then if ! [[ $w =~ ^[+]?[0-9]+$ ]] ; then echo "no positive int weight specified for alignment ${ali_dirs[$n]}" && exit 1; fi - rm $adir/alignment_files.txt 2>/dev/null || true for x in `seq $w`;do - for j in `seq $num_jobs`;do - echo $adir/ali.$j.gz >> $dir/alignment_files.${n}.txt - done - done + for j in `seq $num_jobs`;do gunzip -c $adir/ali.$j.gz; done + done | ali-to-phones $adir/final.mdl ark:- "ark:|gzip -c >$dir/phones.$n.gz" done + $cmd $dir/log/make_phone_lm_fst.log \ - ali_dirs=\(${ali_dirs[@]}\) \; \ for n in `seq 0 $[num_alignments-1]`\; do \ - adir=\${ali_dirs[\$n]} \; \ - cat $dir/alignment_files.\$n.txt \| while read f\; do gunzip -c \$f \; done \| \ - ali-to-phones \$adir/final.mdl ark:- ark:- \; \ + gunzip -c $dir/phones.\$n.gz \; \ done \| \ - chain-est-phone-lm $lm_opts ark:- $dir/phone_lm.fst || exit 1; + chain-est-phone-lm $lm_opts ark:- $dir/phon_lm.fst || exit 1; + rm $dir/phones.*.gz 2>/dev/null || true fi if [ $stage -le 2 ]; then @@ -126,4 +122,6 @@ if [ $stage -le 3 ]; then $dir/den.fst $dir/normalization.fst || exit 1 fi +echo "Successfully created {den,normalization}.fst" + exit 0 From 6f5e8eb98b94da1f37585c0e48bdcc7082a87882 Mon Sep 17 00:00:00 2001 From: Pegita Date: Mon, 11 Sep 2017 21:59:13 -0400 Subject: [PATCH 064/174] fixed some issues. --- .../local/chain/tuning/run_tdnn_wsj_rm_1a.sh | 5 ++-- egs/rm/s5/local/online/run_nnet2_common.sh | 4 ++-- egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py | 8 ++++++- .../nnet3/chain/make_weighted_den_fst.sh | 24 ++++++++++--------- egs/wsj/s5/steps/nnet3/xconfig_to_configs.py | 5 ++-- src/nnet3/nnet-component-itf.h | 2 +- 6 files changed, 29 insertions(+), 19 deletions(-) diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh index 2ede5364fed..9667dee7300 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -126,8 +126,9 @@ if [ $stage -le 6 ]; then fi if [ $stage -le 7 ]; then - echo "$0: creating neural net configs using the xconfig parser for"; - echo "extra layers w.r.t source network."; + echo "$0: Create neural net configs using the xconfig parser for"; + echo " generating new layers, that are specific to rm. 
These layers "; + echo " are added to the transferred part of the wsj network."; num_targets=$(tree-info --print-args=false $treedir/tree |grep num-pdfs|awk '{print $2}') learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) mkdir -p $dir diff --git a/egs/rm/s5/local/online/run_nnet2_common.sh b/egs/rm/s5/local/online/run_nnet2_common.sh index 86346339f62..f1f194fea26 100755 --- a/egs/rm/s5/local/online/run_nnet2_common.sh +++ b/egs/rm/s5/local/online/run_nnet2_common.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script extracts mfcc features using mfcc_config and train ubm model and -# ivector extractor and extract ivector for train and test. +# This script extracts mfcc features using mfcc_config and trains ubm model and +# ivector extractor and extracts ivector for train and test. . cmd.sh diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py index 6b5cc97eaf9..9eae7287119 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py @@ -14,7 +14,7 @@ # Given a list of objects of type XconfigLayerBase ('all_layers'), # including at least the layers preceding 'current_layer' (and maybe # more layers), return the names of layers preceding 'current_layer' -# other than layers of type 'existing', which correspond to component-node +# other than layers of type 'existing', which corresponds to component-node # names from an existing model that we are adding layers to them. # This will be used in parsing expressions like [-1] in descriptors # (which is an alias for the previous layer). @@ -23,6 +23,12 @@ def get_prev_names(all_layers, current_layer): for layer in all_layers: if layer is current_layer: break + + # The following if-statement is needed to handle the case where the + # the layer is an 'existing' layer, derived from an existing trained + # neural network supplied via the existing-model option, that we are + # adding layers to. In this case, these layers are not considered as + # layers preceding 'current_layer'. if layer.layer_type is not 'existing': prev_names.append(layer.get_name()) prev_names_set = set() diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index 947f6175469..e2cde40c6d3 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -31,8 +31,8 @@ cmd=run.pl stage=-10 weights= # comma-separated list of positive int valued scale weights used # to scale different phone sequences for different alignments. - # Scaling the count with i^th int weight 'w' is done by repeating - # the i^th phone sequence 'w' times. + # Scaling the n-gram count with i_th weight 'w' is equivalent to repeating + # the i_th phone sequence 'w' times in phone lm generation. # i.e. "1,10" # If not specified, weight '1' is used for all phone sequences. @@ -50,9 +50,9 @@ if [ $# -lt 2 ]; then echo " --cmd (run.pl|queue.pl...) 
# specify how to run the sub-processes."; echo "--lm-opts # options for phone LM generation"; echo "--weights # comma-separated list of positive int " - echo " # weights used to scale different phone sequences" - echo " # corresponding to different alignment " - echo " # in phone LM generation."; + echo " # weights used to scale phone sequences" + echo " # corresponding to input alignments " + echo " # used in phone LM generation."; exit 1; fi @@ -91,6 +91,7 @@ else fi if [ $stage -le 1 ]; then + rm $dir/all_phones.txt 2>/dev/null || true for n in `seq 0 $[num_alignments-1]`; do w=${w_arr[$n]} adir=${ali_dirs[$n]} @@ -98,16 +99,17 @@ if [ $stage -le 1 ]; then if ! [[ $w =~ ^[+]?[0-9]+$ ]] ; then echo "no positive int weight specified for alignment ${ali_dirs[$n]}" && exit 1; fi + + for j in `seq $num_jobs`;do gunzip -c $adir/ali.$j.gz; done \ + | ali-to-phones $adir/final.mdl ark:- "ark:|gzip -c >$dir/phones.$n.gz" || exit 1; for x in `seq $w`;do - for j in `seq $num_jobs`;do gunzip -c $adir/ali.$j.gz; done - done | ali-to-phones $adir/final.mdl ark:- "ark:|gzip -c >$dir/phones.$n.gz" + echo $dir/phones.$n.gz >> $dir/all_phones.txt + done done $cmd $dir/log/make_phone_lm_fst.log \ - for n in `seq 0 $[num_alignments-1]`\; do \ - gunzip -c $dir/phones.\$n.gz \; \ - done \| \ - chain-est-phone-lm $lm_opts ark:- $dir/phon_lm.fst || exit 1; + cat $dir/all_phones.txt \| while read f \; do gunzip -c \$f \; done \| \ + chain-est-phone-lm $lm_opts ark:- $dir/phone_lm.fst || exit 1; rm $dir/phones.*.gz 2>/dev/null || true fi diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index bbd4589620b..d74135e5980 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -35,9 +35,10 @@ def get_args(): 'already trained model ' 'to generate new config file for new model.' 'The context info is also generated using ' - 'final.config added to existing model.' + 'a model generated by adding final.config ' + 'to the existing model.' 'e.g. In Transfer learning: generate new model using ' - 'nodes in existing model.') + 'component nodes in existing model.') parser.add_argument('--config-dir', required=True, help='Directory to write config files and variables') parser.add_argument('--nnet-edits', type=str, default=None, diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index c4b4bcbc856..f30557f02bf 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -471,7 +471,7 @@ class UpdatableComponent: public Component { /// learning_rate_factor_. virtual void SetAsGradient() { learning_rate_ = 1.0; is_gradient_ = true; } - // Sets the learning rate factors to set to this value. + // Sets the learning rate factors to lrate_factor. virtual void SetLearningRateFactor(BaseFloat lrate_factor) { learning_rate_factor_ = lrate_factor; } From 3985924465d8568acd5965c0e930d5dcc1d889b3 Mon Sep 17 00:00:00 2001 From: Pegita Date: Tue, 12 Sep 2017 11:58:07 -0400 Subject: [PATCH 065/174] fixed some small issues. 
--- egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh | 6 +++--- egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 6 +++--- egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh index 9667dee7300..89fd23374a5 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -59,7 +59,7 @@ fi required_files="$src_mfcc_config $src_mdl" use_ivector=false ivector_dim=$(nnet3-am-info --print-args=false $src_mdl | grep "ivector-dim" | cut -d" " -f2) -if [ "$ivector_dim" == "" ]; then ivector_dim=0 ; fi +if [ -z $ivector_dim ]; then ivector_dim=0 ; fi if [ ! -z $src_ivec_extractor_dir ]; then if [ $ivector_dim -eq 0 ]; then @@ -72,13 +72,13 @@ if [ ! -z $src_ivec_extractor_dir ]; then else if [ $ivector_dim -gt 0 ]; then echo "$0: ivector is used in training the source model '$src_mdl' but no " - echo "ivector extractor dir for source model is specified." && exit 1; + echo " --src-ivec-extractor-dir option as ivector dir for source model is specified." && exit 1; fi fi for f in $required_files; do if [ ! -f $f ]; then - echo "$0: no such file $f." + echo "$0: no such file $f." && exit 1; fi done diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index 070337c4546..85d0d630850 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -24,7 +24,7 @@ set -e # configs for 'chain' -stage=7 +stage=0 train_stage=-4 get_egs_stage=-10 tdnn_affix=_1b @@ -100,7 +100,7 @@ required_files="$src_mfcc_config $src_mdl $src_lang/phones.txt $src_dict/lexicon use_ivector=false ivector_dim=$(nnet3-am-info --print-args=false $src_mdl | grep "ivector-dim" | cut -d" " -f2) -if [ "$ivector_dim" == "" ]; then ivector_dim=0 ; fi +if [ -z $ivector_dim ]; then ivector_dim=0 ; fi if [ ! -z $src_ivec_extractor_dir ]; then if [ $ivector_dim -eq 0 ]; then @@ -125,7 +125,7 @@ for f in $required_files; do done if [ $stage -le -1 ]; then - echo "$0: prepare lexicon.txt for RM using WSJ lexicon." + echo "$0: Prepare lang for RM-WSJ using WSJ phone set and lexicon and RM word list." if ! cmp -s <(grep -v "^#" $src_lang/phones.txt) <(grep -v "^#" data/lang/phones.txt); then local/prepare_wsj_rm_lang.sh $src_dict $src_lang $lang_src_tgt else diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index 38ee53bf19f..a752762e83e 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -27,7 +27,7 @@ set -e # configs for 'chain' -stage=8 +stage=0 train_stage=-4 get_egs_stage=-10 dir=exp/chain/tdnn_wsj_rm_1c @@ -97,7 +97,7 @@ required_files="$src_mfcc_config $src_mdl $src_lang/phones.txt $src_dict/lexicon use_ivector=false ivector_dim=$(nnet3-am-info --print-args=false $src_mdl | grep "ivector-dim" | cut -d" " -f2) -if [ "$ivector_dim" == "" ]; then ivector_dim=0 ; fi +if [ -z $ivector_dim ]; then ivector_dim=0 ; fi if [ ! -z $src_ivec_extractor_dir ]; then if [ $ivector_dim -eq 0 ]; then @@ -122,7 +122,7 @@ for f in $required_files; do done if [ $stage -le -1 ]; then - echo "$0: prepare lang for RM-WSJ using WSJ phone set and lexicon and RM word list." + echo "$0: Prepare lang for RM-WSJ using WSJ phone set and lexicon and RM word list." if ! 
cmp -s <(grep -v "^#" $src_lang/phones.txt) <(grep -v "^#" $lang_dir/phones.txt); then local/prepare_wsj_rm_lang.sh $src_dict $src_lang $lang_src_tgt || exit 1; else From fe07c0bf066d44190676ece77fde514353de7a21 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 13 Sep 2017 00:15:59 -0400 Subject: [PATCH 066/174] [scripts] Cosmetic and other improvements to make_weighted_den_fst.sh script --- .../local/chain/tuning/run_tdnn_wsj_rm_1b.sh | 10 +-- .../local/chain/tuning/run_tdnn_wsj_rm_1c.sh | 8 +- .../nnet3/chain/make_weighted_den_fst.sh | 90 +++++++++---------- 3 files changed, 54 insertions(+), 54 deletions(-) diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index 85d0d630850..3e8d5717d4b 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -10,7 +10,7 @@ # Since we use phone.txt from source dataset, this can be helpful in cases # where there is few training data in the target domain and some 4-gram phone # sequences have no count in the target domain. -# 4) It uses whole already-trained model and does not replace the output layer +# 4) It uses whole already-trained model and does not replace the output layer # from already-trained model with new randomely initialized output layer and # re-train it using target dataset. @@ -35,9 +35,9 @@ primary_lr_factor=0.25 # The learning-rate factor for transferred layers from so # model. e.g. if 0, it fixed the paramters transferred from source. # The learning-rate factor for new added layers is 1.0. nnet_affix=_online_wsj -phone_lm_scales="1,10" # comma-separated list of positive int valued scale weights - # to scale different phone sequences for different alignments - # e.g. (src-weight,target-weight)=(1,10) +phone_lm_scales="1,10" # comma-separated list of positive integer multiplicities + # to apply to the different source data directories (used + # to give the RM data a higher weight). # model and dirs for source model used for transfer learning src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl # Input chain model @@ -161,7 +161,7 @@ fi if [ $stage -le 6 ]; then echo "$0: compute {den,normalization}.fst using weighted phone LM with wsj and rm weight $phone_lm_scales." steps/nnet3/chain/make_weighted_den_fst.sh --cmd "$train_cmd" \ - --weights $phone_lm_scales \ + --num-repeats $phone_lm_scales \ --lm-opts '--num-extra-lm-states=200' \ $src_tree_dir $lat_dir $dir || exit 1; fi diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index a752762e83e..611aede371d 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -38,9 +38,9 @@ common_egs_dir= primary_lr_factor=0.25 # learning-rate factor for all except last layer in transferred source model nnet_affix=_online_wsj -phone_lm_scales="1,10" # comma-separated list of int valued scale weights - # to scale different phone sequences for different alignments - # e.g. (src-weight,target-weight)=(10,1) +phone_lm_scales="1,10" # comma-separated list of positive integer multiplicities + # to apply to the different source data directories (used + # to give the RM data a higher weight). # model and dirs for source model used for transfer learning src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl # input chain model @@ -163,7 +163,7 @@ fi if [ $stage -le 6 ]; then echo "$0: compute {den,normalization}.fst using weighted phone LM." 
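+  # (Each comma-separated value in $phone_lm_scales is a repeat count for the
+  # corresponding alignment-source argument below, i.e. the WSJ tree dir and
+  # the RM lattice dir, so "1,10" up-weights the RM phone sequences in the LM.)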
steps/nnet3/chain/make_weighted_den_fst.sh --cmd "$train_cmd" \ - --weights $phone_lm_scales \ + --num-repeats $phone_lm_scales \ --lm-opts '--num-extra-lm-states=200' \ $src_tree_dir $lat_dir $dir || exit 1; fi diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index e2cde40c6d3..493353c675b 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -20,21 +20,19 @@ # first alignment directory to the chain directory. # This script can accept multiple sources of alignments with same phone sets # that can be weighted to estimate phone LM. -# 'weights' is comma-separated list of positive int values used -# to scale different phone sequences for different alignments. -# Each alignment directory should contain tree, final.mdl and ali.*.gz. +# You can use the --num-repeats option to repeat some source data more than +# once when training the LM for the denominator FST. set -o pipefail # begin configuration section. cmd=run.pl -stage=-10 -weights= # comma-separated list of positive int valued scale weights used - # to scale different phone sequences for different alignments. - # Scaling the n-gram count with i_th weight 'w' is equivalent to repeating - # the i_th phone sequence 'w' times in phone lm generation. - # i.e. "1,10" - # If not specified, weight '1' is used for all phone sequences. +stage=0 +num_repeats= # Comma-separated list of positive integer multiplicities, one + # for each input alignment directory. The alignments from + # each source will be scaled by the corresponding value when + # training the LM. + # If not specified, weight '1' is used for all data sources. lm_opts='--num-extra-lm-states=2000' #end configuration section. @@ -47,28 +45,28 @@ if [ $# -lt 2 ]; then echo "Usage: $0 [options] [ ...] "; echo "e.g.: $0 exp/tri1_ali exp/tri2_ali exp/chain/tdnn_1a_sp"; echo "Options: " - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."; - echo "--lm-opts # options for phone LM generation"; - echo "--weights # comma-separated list of positive int " - echo " # weights used to scale phone sequences" - echo " # corresponding to input alignments " - echo " # used in phone LM generation."; + echo " --cmd (run.pl|queue.pl...) # Specify how to run jobs."; + echo "--lm-opts # Options for phone LM generation"; + echo "--num-repeats # Comma-separated list of postive integer" + echo " # multiplicities, one for each input" + echo " # alignment directory. The alignments" + echo " # from each source will be scaled by" + echo " # the corresponding value when training" + echo " # the LM. If not specified, weight '1'" + echo " # is used for all data sources." exit 1; fi -dir=${@: -1} # last argument to the script +dir=${@: -1} # the working directory: last argument to the script ali_dirs=( $@ ) # read the remaining arguments into an array -unset ali_dirs[${#ali_dirs[@]}-1] # 'pop' the last argument which is odir -num_alignments=${#ali_dirs[@]} # number of alignment dirs to combine +unset ali_dirs[${#ali_dirs[@]}-1] # 'pop' the last argument which is $dir +num_alignments=${#ali_dirs[@]} # number of alignment dirs to combine mkdir -p $dir/log for n in `seq 0 $[$num_alignments-1]`;do ali_dir=${ali_dirs[$n]} for f in $ali_dir/ali.1.gz $ali_dir/final.mdl $ali_dir/tree; do - if [ ! -f $f ]; then - echo "$0: Could not find file $f" - exit 1 - fi + [ ! 
-f $f ] && echo "$0: Expected file $f to exist" && exit 1; done utils/lang/check_phones_compatible.sh ${ali_dirs[0]}/phones.txt \ ${ali_dirs[$n]}/phones.txt || exit 1; @@ -76,41 +74,43 @@ done cp ${ali_dirs[0]}/tree $dir/ || exit 1 -if [ -z $weights ]; then - # If 'weights' is not specified, comma-separated array '1' with dim - #'num_alignments' is defined as 'weights'. - w_arr=() - for n in `seq 1 $num_alignments`;do w_arr+=(1); done +if [ -z "$num_repeats" ]; then + # If 'num_repeats' is not specified, set num_repeats_array to e.g. (1 1 1). + num_repeats_array=( $(for n in $(seq $num_alignments); do echo 1; done) ) else - w_arr=(${weights//,/ }) - num_weights=${#w_arr[@]} - if [ $num_alignments -ne $num_weights ]; then - echo "$0: number of weights in $weight, $num_weights, should be equal to the " - echo "number of alignment directories, $num_alignments." && exit 1; + num_repeats_array=(${num_repeats//,/ }) + num_repeats=${#num_repeats_array[@]} + if [ $num_repeats -ne $num_alignments ]; then + echo "$0: too many or too few elements in --num-repeats option: '$num_repeats'" + exit 1 fi fi if [ $stage -le 1 ]; then - rm $dir/all_phones.txt 2>/dev/null || true + all_phones="" # will contain the names of the .gz files containing phones, + # with some members possibly repeated per the --num-repeats + # option for n in `seq 0 $[num_alignments-1]`; do - w=${w_arr[$n]} - adir=${ali_dirs[$n]} - num_jobs=$(cat $adir/num_jobs) - if ! [[ $w =~ ^[+]?[0-9]+$ ]] ; then - echo "no positive int weight specified for alignment ${ali_dirs[$n]}" && exit 1; + this_num_repeats=${num_repeats_array[$n]} + this_alignment_adir=${ali_dirs[$n]} + num_jobs=$(cat $this_alignment_adir/num_jobs) + if ! [ "$this_num_repeats" -gt 0 ]; then + echo "Expected comma-separated list of integers for --num-repeats option, got '$num_repeats'" + exit 1 fi - for j in `seq $num_jobs`;do gunzip -c $adir/ali.$j.gz; done \ - | ali-to-phones $adir/final.mdl ark:- "ark:|gzip -c >$dir/phones.$n.gz" || exit 1; - for x in `seq $w`;do - echo $dir/phones.$n.gz >> $dir/all_phones.txt + + for j in $(seq $num_jobs); do gunzip -c $this_alignment_adir/ali.$j.gz; done | \ + ali-to-phones $this_alignment_dir/final.mdl ark:- "ark:|gzip -c >$dir/phones.$n.gz" || exit 1; + + all_phones="$all_phones $(for r in $(seq $num_repeats); do echo $dir/phones.$n.gz; done)" done done $cmd $dir/log/make_phone_lm_fst.log \ - cat $dir/all_phones.txt \| while read f \; do gunzip -c \$f \; done \| \ + gunzip -c $all_phones \| \ chain-est-phone-lm $lm_opts ark:- $dir/phone_lm.fst || exit 1; - rm $dir/phones.*.gz 2>/dev/null || true + rm $dir/phones.*.gz fi if [ $stage -le 2 ]; then From b5ce6473343d522de37b570e1cb59e7ebbd24bda Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 13 Sep 2017 13:53:23 -0400 Subject: [PATCH 067/174] smbr: Logging bug fix --- egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index cbcb646c6b8..c4371cb636a 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -315,7 +315,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, if shrinkage_value != 1.0: shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value) - objf_info = "" if smbr_opt != "" else ( + objf_info = "" if smbr_opt == "" else ( "and objective is sMBR and smbr_opt=" + 
smbr_opt) logger.info("On iteration {0}, learning rate is {1}" "{shrink_info} {objf_info}.".format( From 967531d98513594063c3a19a4c2a1fc0900297b5 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 13 Sep 2017 13:53:47 -0400 Subject: [PATCH 068/174] semisup: Extend trivial output layer --- .../steps/libs/nnet3/xconfig/basic_layers.py | 58 +++++++++++-------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index ae61214160f..455531b1f63 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -42,13 +42,13 @@ def __init__(self, first_token, key_to_value, all_layers): raise RuntimeError("Invalid value: name={0}".format( key_to_value['name'])) - # It is possible to have two layers with a same name in 'all_layer', if + # It is possible to have two layers with a same name in 'all_layer', if # the layer type for one of them is 'existing'. # Layers of type 'existing' are corresponding to the component-node names # in the existing model, which we are adding layers to them. # 'existing' layers are not presented in any config file, and new layer # with the same name can exist in 'all_layers'. - # e.g. It is possible to have 'output-node' with name 'output' in the + # e.g. It is possible to have 'output-node' with name 'output' in the # existing model, which is added to all_layers using layer type 'existing', # and 'output-node' of type 'output-layer' with the same name 'output' in # 'all_layers'. @@ -379,7 +379,7 @@ def set_default_configs(self): # note: self.config['input'] is a descriptor, '[-1]' means output # the most recent layer. - self.config = {'input': '[-1]', 'dim': -1} + self.config = {'input': '[-1]', 'dim': -1, 'skip-in-init': False} def check_configs(self): @@ -416,6 +416,8 @@ def get_full_config(self): descriptor_final_str = self.descriptors['input']['final-string'] for config_name in ['init', 'ref', 'final']: + if config_name == 'init' and self.config['skip-in-init']: + continue ans.append((config_name, 'output-node name={0} input={1}'.format( self.name, descriptor_final_str))) @@ -507,28 +509,38 @@ def check_configs(self): " invalid value {0}" "".format(self.config['learning-rate-factor'])) - # you cannot access the output of this layer from other layers... see - # comment in output_name for the reason why. def auxiliary_outputs(self): - return [] + auxiliary_outputs = ['affine'] + if self.config['include-log-softmax']: + auxiliary_outputs.append('log-softmax') - def output_name(self, auxiliary_outputs=None): + return auxiliary_outputs + + def output_name(self, auxiliary_output=None): + + if auxiliary_output is None: + # Note: nodes of type output-node in nnet3 may not be accessed in + # Descriptors, so calling this with auxiliary_outputs=None doesn't + # make sense. + raise RuntimeError("Outputs of output-layer may not be used by other" + " layers") - # Note: nodes of type output-node in nnet3 may not be accessed in - # Descriptors, so calling this with auxiliary_outputs=None doesn't - # make sense. But it might make sense to make the output of the softmax - # layer and/or the output of the affine layer available as inputs to - # other layers, in some circumstances. - # we'll implement that when it's needed. 
- raise RuntimeError("Outputs of output-layer may not be used by other" - " layers") + if auxiliary_output in self.auxiliary_outputs(): + return '{0}.{1}'.format(self.name, auxiliary_output) + else: + raise RuntimeError("Unknown auxiliary output name {0}" + "".format(auxiliary_output)) def output_dim(self, auxiliary_output=None): - # see comment in output_name(). - raise RuntimeError("Outputs of output-layer may not be used by other" - " layers") + if auxiliary_output is None: + # Note: nodes of type output-node in nnet3 may not be accessed in + # Descriptors, so calling this with auxiliary_outputs=None doesn't + # make sense. + raise RuntimeError("Outputs of output-layer may not be used by other" + " layers") + return self.config['dim'] def get_full_config(self): @@ -1092,19 +1104,19 @@ def get_full_config(self): class XconfigExistingLayer(XconfigLayerBase): """ - This class is for lines like + This class is for lines like 'existing name=tdnn1.affine dim=40'. - - This layer contains 'dim' and 'name' and it is not presented in + + This layer contains 'dim' and 'name' and it is not presented in any actual config files. Layers of this type are created internally for all component nodes in an existing neural net model to use as input to other layers. (i.e. get_model_component_info function, which is called in steps/nnet3/xconfig_to_configs.py, returns a list of 'existing' layers for component nodes used in 'existing_model') - + This class is useful in cases like transferring existing model - and using {input, output, component}-nodes in this model as + and using {input, output, component}-nodes in this model as input to new layers. """ From e5e57eed905c060709778557ef5bf4f450e85762 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 13 Sep 2017 13:54:06 -0400 Subject: [PATCH 069/174] temp fix --- egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index 5e48acb914b..8d989f190e7 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -36,7 +36,7 @@ weights= # comma-separated list of positive int valued scale weights used # i.e. "1,10" # If not specified, weight '1' is used for all phone sequences. -lm_opts='num_extra_lm_state=2000' +lm_opts='--num-extra-lm-states=2000' #end configuration section. 
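To tie the last few make_weighted_den_fst.sh patches together: with the option string fixed as above (it is substituted verbatim into the command line, so it has to be the literal --num-extra-lm-states flag), the phone-LM stage of that script reduces to roughly the pipeline sketched below. The paths are placeholders modelled on the script's own usage example, and the sketch assumes a single alignment source whose phone sequences are repeated twice, as a --num-repeats value of 2 would request.

  # illustrative sketch only; exp/tri1_ali and exp/chain/tdnn_1a_sp are placeholders
  ali_dir=exp/tri1_ali; dir=exp/chain/tdnn_1a_sp
  num_jobs=$(cat $ali_dir/num_jobs)
  for j in $(seq $num_jobs); do gunzip -c $ali_dir/ali.$j.gz; done | \
    ali-to-phones $ali_dir/final.mdl ark:- "ark:|gzip -c >$dir/phones.0.gz"
  gunzip -c $dir/phones.0.gz $dir/phones.0.gz | \
    chain-est-phone-lm --num-extra-lm-states=2000 ark:- $dir/phone_lm.fst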
From d61cb4b0f7bc1f48ea5951472cb388076554397f Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sun, 24 Sep 2017 22:10:41 -0400 Subject: [PATCH 070/174] semisup: Adding lattice splitting chain code --- .../run_tdnn_11k_semisupervised_conf_e.sh | 21 +- .../run_tdnn_11k_semisupervised_conf_h.sh | 2 +- .../run_tdnn_11k_semisupervised_conf_l.sh | 21 +- .../semisup/chain/tuning/run_tdnn_15k_c.sh | 193 +++++++ egs/fisher_english/s5/path.sh | 2 +- egs/wsj/s5/steps/lmrescore_const_arpa.sh | 5 +- egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh | 507 ++++++++++++++++++ .../nnet3/chain/make_weighted_den_fst.sh | 9 +- egs/wsj/s5/steps/nnet3/decode.sh | 15 +- egs/wsj/s5/steps/nnet3/get_degs.sh | 2 +- src/chain/Makefile | 4 +- src/chain/chain-supervision-splitter-test.cc | 228 ++++++++ src/chain/chain-supervision-splitter.cc | 481 +++++++++++++++++ src/chain/chain-supervision-splitter.h | 121 +++++ src/chain/chain-supervision.cc | 5 +- src/chainbin/Makefile | 4 +- src/chainbin/chain-split-lattices.cc | 185 +++++++ src/hmm/hmm-test-utils.cc | 4 +- src/hmm/hmm-test-utils.h | 2 +- src/lat/lattice-functions.cc | 116 +++- src/lat/lattice-functions.h | 12 +- src/latbin/Makefile | 2 +- src/latbin/lattice-align-phones.cc | 69 ++- ...ce-determinize-phone-pruned-non-compact.cc | 139 +++++ .../lattice-determinize-pruned-non-compact.cc | 102 ---- src/latbin/lattice-lmrescore-const-arpa.cc | 58 +- src/latbin/lattice-lmrescore.cc | 41 +- src/latbin/lattice-prune.cc | 52 +- src/latbin/lattice-scale.cc | 41 +- src/latbin/lattice-to-fst.cc | 130 ++++- src/latbin/lattice-to-phone-lattice.cc | 51 +- 31 files changed, 2398 insertions(+), 226 deletions(-) create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_c.sh create mode 100755 egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh create mode 100644 src/chain/chain-supervision-splitter-test.cc create mode 100644 src/chain/chain-supervision-splitter.cc create mode 100644 src/chain/chain-supervision-splitter.h create mode 100644 src/chainbin/chain-split-lattices.cc create mode 100644 src/latbin/lattice-determinize-phone-pruned-non-compact.cc diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh index b8caaa53dea..9f6d3a23b8a 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh @@ -23,6 +23,8 @@ semi_affix=semi11k_250k # affix relating train-set splitting proportion tdnn_affix=7b # affix for the supervised chain-model directory train_supervised_opts="--stage -10 --train-stage -10" +lm_opts= + # Unsupervised options decode_affix= egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir @@ -176,7 +178,7 @@ fi dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} if [ $stage -le 10 ]; then - steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + steps/nnet3/chain/make_weighted_den_fst.sh --weights $lm_weights --cmd "$train_cmd" --lm-opts "$lm_opts" \ ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ $dir fi @@ -207,8 +209,6 @@ if [ $stage -le 11 ]; then ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 - output-layer name=output-0 input=prefinal-chain include-log-softmax=false 
dim=$num_targets max-change=1.5 - output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 # adding the layers for xent branch @@ -221,17 +221,16 @@ if [ $stage -le 11 ]; then # constant; and the 0.5 was tuned so as to make the relative progress # similar in the xent and regular final layers. relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 - output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ - cp $dir/configs/final.config{,.orig} - - cat $dir/configs/final.config.orig | \ - perl -pe 's/component=output-1.affine/component=output-0.affine/g; - s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ - $dir/configs/final.config fi . $dir/configs/vars diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_h.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_h.sh index 492f992c6f0..c3f1cabc81a 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_h.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_h.sh @@ -174,7 +174,7 @@ fi dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} if [ $stage -le 10 ]; then - steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ $dir fi diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_l.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_l.sh index c2772704943..7e5d41887cd 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_l.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_l.sh @@ -53,6 +53,7 @@ minibatch_size=128 decode_iter= +do_finetuning=false finetune_stage=-2 finetune_suffix=_finetune finetune_iter=final @@ -384,14 +385,22 @@ if [ $stage -le 15 ]; then --dir $dir || exit 1; fi -graph_dir=$dir/graph -if [ $stage -le 17 ]; then +graph_dir=$dir/graph_unk +if [ $stage -le 17 ] && [ ! -f $graph_dir/HCLG.fst ]; then + if [ ! -f data/lang_test_unk/L_disambig.fst ]; then + utils/prepare_lang.sh --unk-fst exp/unk_lang_model/unk_fst.txt \ + data/local/dict "$(cat data/lang/oov.txt)" data/local/lm data/lang_test_unk + + cp data/lang_test/G.fst data/lang_test_unk/ + fi + # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_unk $dir $graph_dir fi +decode_suffix=_unk if [ $stage -le 18 ]; then iter_opts= if [ ! -z $decode_iter ]; then @@ -410,11 +419,15 @@ if [ $stage -le 18 ]; then steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ - $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + $graph_dir data/${decode_set}_hires $dir/decode${decode_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; ) & done fi +if ! $do_finetuning; then + wait; exit 0; +fi + if [ $stage -le 19 ]; then mkdir -p ${dir}${finetune_suffix} diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_c.sh new file mode 100755 index 00000000000..3d09d9ee4ab --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_c.sh @@ -0,0 +1,193 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 10 hours. + +# configs for 'chain' +stage=0 +tdnn_affix=7c +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +ivector_train_set=semisup15k_250k +tree_affix= +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/path.sh b/egs/fisher_english/s5/path.sh index 0b0b53d8e5d..7cad3842ab3 100755 --- a/egs/fisher_english/s5/path.sh +++ b/egs/fisher_english/s5/path.sh @@ -2,6 +2,6 @@ export KALDI_ROOT=`pwd`/../../.. export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . 
$KALDI_ROOT/tools/config/common_path.sh -export PYTHONPATH=$PYTHONPATH:$KALDI_ROOT/tools/tensorflow_build/.local/lib/python2.7/site-packages +export PYTHONPATH=${PYTHONPATH:+$PYTHONPATH:}$KALDI_ROOT/tools/tensorflow_build/.local/lib/python2.7/site-packages export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$KALDI_ROOT/tools/tensorflow/bazel-bin/tensorflow/:/usr/local/cuda/lib64:/export/a11/hlyu/cudnn/lib64:/home/dpovey/libs/ export LC_ALL=C diff --git a/egs/wsj/s5/steps/lmrescore_const_arpa.sh b/egs/wsj/s5/steps/lmrescore_const_arpa.sh index 796ff5fc95c..97f8379c7df 100755 --- a/egs/wsj/s5/steps/lmrescore_const_arpa.sh +++ b/egs/wsj/s5/steps/lmrescore_const_arpa.sh @@ -10,6 +10,7 @@ cmd=run.pl skip_scoring=false stage=1 scoring_opts= +write_compact=true # End configuration section. echo "$0 $@" # Print the command line for logging @@ -53,9 +54,9 @@ cp $indir/num_jobs $outdir if [ $stage -le 1 ]; then $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ - lattice-lmrescore --lm-scale=-1.0 \ + lattice-lmrescore --lm-scale=-1.0 --write-compact=$write_compact \ "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlmcommand" ark:- \| \ - lattice-lmrescore-const-arpa --lm-scale=1.0 \ + lattice-lmrescore-const-arpa --lm-scale=1.0 --write-compact=$write_compact \ ark:- "$newlm" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; fi diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh new file mode 100755 index 00000000000..c4baf3c4ea1 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh @@ -0,0 +1,507 @@ +#!/bin/bash + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script, which will generally be called from other neural-net training +# scripts, extracts the training examples used to train the 'chain' system +# (and also the validation examples used for diagnostics), and puts them in +# separate archives. +# +# This script dumps egs with many frames of labels, controlled by the +# frames_per_eg config variable (default: 25), plus left and right context. +# Because CTC training involves alignment of data, we can't meaningfully train +# frame by frame. The supervision approach involves the time alignment, though-- +# it is just applied in a loose way, where each symbol can appear in the +# frame-range that it was in in the alignment, extended by a certain margin. +# + + +# Begin configuration section. +cmd=run.pl +frames_per_eg=25 # number of feature frames example (not counting added context). + # more->less disk space and less time preparing egs, but more + # I/O during training. +frames_overlap_per_eg=0 # number of supervised frames of overlap that we aim for per eg. + # can be useful to avoid wasted data if you're using --left-deriv-truncate + # and --right-deriv-truncate. +frame_subsampling_factor=3 # frames-per-second of features we train on divided + # by frames-per-second at output of chain model +alignment_subsampling_factor=3 # frames-per-second of input alignments divided + # by frames-per-second at output of chain model +left_context=4 # amount of left-context per eg (i.e. extra frames of input features + # not present in the output supervision). +right_context=4 # amount of right-context per eg. +left_context_initial=-1 # if >=0, left-context for first chunk of an utterance +right_context_final=-1 # if >=0, right-context for last chunk of an utterance +compress=true # set this to false to disable compression (e.g. if you want to see whether + # results are affected). 
+ +num_utts_subset=300 # number of utterances in validation and training + # subsets used for shrinkage and diagnostics. +num_valid_egs_combine=0 # #validation examples for combination weights at the very end. +num_train_egs_combine=1000 # number of train examples for the above. +num_egs_diagnostic=400 # number of frames for "compute_prob" jobs +frames_per_iter=400000 # each iteration of training, see this many frames per + # job, measured at the sampling rate of the features + # used. This is just a guideline; it will pick a number + # that divides the number of samples in the entire data. + +right_tolerance= #CTC right tolerance == max label delay. +left_tolerance= + +right_tolerance_silence= # Tolerances for silence phones +left_tolerance_silence= + +transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms + +stage=0 +nj=15 # This should be set to the maximum number of jobs you are + # comfortable to run in parallel; you can increase it if your disk + # speed is greater and you have more machines. +max_shuffle_jobs_run=50 # the shuffle jobs now include the nnet3-chain-normalize-egs command, + # which is fairly CPU intensive, so we can run quite a few at once + # without overloading the disks. +srand=0 # rand seed for nnet3-chain-get-egs, nnet3-chain-copy-egs and nnet3-chain-shuffle-egs +online_ivector_dir= # can be used if we are including speaker information as iVectors. +cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, + # it doesn't make sense to use different options than were used as input to the + # LDA transform). This is used to turn off CMVN in the online-nnet experiments. +lattice_lm_scale= # If supplied, the graph/lm weight of the lattices will be + # used (with this scale) in generating supervisions +egs_weight=1.0 # The weight which determines how much each training example + # contributes to gradients while training (can be used + # to down/up-weight a dataset) +lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, + # before being used to get supervisions. +acwt=0.1 # For pruning +phone_insertion_penalty= +deriv_weights_scp= +generate_egs_scp=false + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train exp/tri4_nnet exp/tri3_lats exp/tri4_nnet/egs" + echo "" + echo "From , 0.trans_mdl (the transition-model), tree (the tree)" + echo "and normalization.fst (the normalization FST, derived from the denominator FST)" + echo "are read." + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --nj # The maximum number of jobs you want to run in" + echo " # parallel (increase this only if you have good disk and" + echo " # network speed). default=6" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --frames-per-iter <#samples;400000> # Number of frames of data to process per iteration, per" + echo " # process." 
+ echo " --frame-subsampling-factor # factor by which num-frames at nnet output is reduced " + echo " --frames-per-eg # number of supervised frames per eg on disk" + echo " --frames-overlap-per-eg # number of supervised frames of overlap between egs" + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --left-context-initial # If >= 0, left-context for first chunk of an utterance" + echo " --right-context-final # If >= 0, right-context for last chunk of an utterance" + echo " --num-egs-diagnostic <#frames;4000> # Number of egs used in computing (train,valid) diagnostics" + echo " --num-valid-egs-combine <#frames;10000> # Number of egss used in getting combination weights at the" + echo " # very end." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + exit 1; +fi + +data=$1 +chaindir=$2 +latdir=$3 +dir=$4 + +# Check some files. +[ ! -z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" + +for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ + $chaindir/{0.trans_mdl,tree,normalization.fst} $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log $dir/info + +num_lat_jobs=$(cat $latdir/num_jobs) || exit 1; + +# Get list of validation utterances. + +frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 +utils/data/get_utt2dur.sh $data + +cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; + +len_uttlist=`wc -l $dir/valid_uttlist | awk '{print $1}'` +if [ $len_uttlist -lt $num_utts_subset ]; then + echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; +fi + +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. + # because of this stage we can again have utts with lengths less than + # frames_per_eg + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp +fi + +cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; +len_uttlist=`wc -l $dir/train_subset_uttlist | awk '{print $1}'` +if [ $len_uttlist -lt $num_utts_subset ]; then + echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; +fi + +[ -z "$transform_dir" ] && transform_dir=$latdir + +# because we'll need the features with a different number of jobs than $latdir, +# copy to ark,scp. 
+if [ -f $transform_dir/raw_trans.1 ]; then + echo "$0: using raw transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi + +## Set up features. +echo "$0: feature type is raw" +feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" +valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" +train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" +echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. + +if [ -f $dir/trans.scp ]; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" +fi + +tree-info $chaindir/tree | grep num-pdfs | awk '{print $2}' > $dir/info/num_pdfs || exit 1 + +if [ ! -z "$online_ivector_dir" ]; then + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + echo $ivector_dim > $dir/info/ivector_dim + steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/info/final.ie.id || exit 1 + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +else + ivector_opts="" + echo 0 >$dir/info/ivector_dim +fi + +if [ $stage -le 1 ]; then + echo "$0: working out number of frames of training data" + num_frames=$(steps/nnet2/get_num_frames.sh $data) + echo $num_frames > $dir/info/num_frames + echo "$0: working out feature dim" + feats_one="$(echo $feats | sed s/JOB/1/g)" + if ! feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null); then + echo "Command failed (getting feature dim): feat-to-dim \"$feats_one\"" + exit 1 + fi + echo $feat_dim > $dir/info/feat_dim +else + num_frames=$(cat $dir/info/num_frames) || exit 1; + feat_dim=$(cat $dir/info/feat_dim) || exit 1; +fi + +# the + 1 is to round up, not down... we assume it doesn't divide exactly. +num_archives=$[$num_frames/$frames_per_iter+1] + +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). +# This sometimes gives a misleading answer as GridEngine sometimes changes the +# limit, so we limit it to 512. +max_open_filehandles=$(ulimit -n) || exit 1 +[ $max_open_filehandles -gt 512 ] && max_open_filehandles=512 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple] || exit 1; +done +# now make sure num_archives is an exact multiple of archives_multiple. 
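+# (Illustrative arithmetic with made-up numbers: num_frames=4000000 and
+# frames_per_iter=400000 give num_archives=11; if the open-filehandle limit
+# were as low as 8, the loop above would stop at archives_multiple=3 with
+# num_archives_intermediate=3, and the next line would round num_archives to 9.)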
+num_archives=$[$archives_multiple*$num_archives_intermediate] || exit 1; + +echo $num_archives >$dir/info/num_archives +echo $frames_per_eg >$dir/info/frames_per_eg +# Work out the number of egs per archive +egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] || exit 1; +! [ $egs_per_archive -le $frames_per_iter ] && \ + echo "$0: script error: egs_per_archive=$egs_per_archive not <= frames_per_iter=$frames_per_iter" \ + && exit 1; + +echo $egs_per_archive > $dir/info/egs_per_archive + +echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" +echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" +if [ $left_context_initial -ge 0 ] || [ $right_context_final -ge 0 ]; then + echo "$0: ... and (left-context-initial,right-context-final) = ($left_context_initial,$right_context_final)" +fi + + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. + echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/cegs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/cegs_orig.$y.$x.ark; done) + done +fi + +if [ $stage -le 2 ]; then + echo "$0: copying training lattices" + + [ ! -z $lattice_prune_beam ] && \ + prune_cmd="ark:- | lattice-prune --write-compact=false --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:-" + $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $latdir/lat.JOB.gz|" $prune_cmd ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; + + for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp +fi + + +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" +[ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" +[ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" + +[ ! -z "$deriv_weights_scp" ] && egs_opts="$egs_opts --deriv-weights-rspecifier=scp:$deriv_weights_scp" + +chain_supervision_all_opts="--supervision.frame-subsampling-factor=$alignment_subsampling_factor" +[ ! -z $right_tolerance ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --supervision.right-tolerance=$right_tolerance" + +[ ! -z $left_tolerance ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --supervision.left-tolerance=$left_tolerance" + +normalization_scale=1.0 +if [ ! -z "$lattice_lm_scale" ]; then + chain_supervision_all_opts="$chain_supervision_all_opts --supervision.lm-scale=$lattice_lm_scale" + normalization_scale=$(perl -e " + if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) { + print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; + exit(1); + } + print (1.0 - $lattice_lm_scale);") +fi + +[ ! -z $phone_insertion_penalty ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --supervision.phone-ins-penalty=$phone_insertion_penalty" + +[ ! -z $right_tolerance_silence ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --supervision.right-tolerance-silence=$right_tolerance_silence" + +[ ! -z $left_tolerance_silence ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --supervision.left-tolerance-silence=$left_tolerance_silence" + +if [ ! 
-z $left_tolerance_silence ] && [ ! -z $right_tolerance_silence ]; then + chain_supervision_all_opts="$chain_supervision_all_opts --supervision.silence-phones=$(cat $lang/phones/silence_phones.csl)" +fi + +echo $left_context > $dir/info/left_context +echo $right_context > $dir/info/right_context +echo $left_context_initial > $dir/info/left_context_initial +echo $right_context_final > $dir/info/right_context_final + +if [ $stage -le 3 ]; then + echo "$0: Getting validation and training subset examples." + rm $dir/.error 2>/dev/null + echo "$0: ... extracting validation and training-subset alignments." + + # do the filtering just once, as lat.scp may be long. + utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ + <$dir/lat.scp >$dir/lat_special.scp + + $cmd $dir/log/create_valid_subset.log \ + utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ + lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=$srand \ + $egs_opts $chaindir/normalization.fst \ + "$valid_feats" $chaindir/tree $chaindir/0.trans_mdl \ + ark,s,cs:- "ark:$dir/valid_all.cegs" || touch $dir/.error & + $cmd $dir/log/create_train_subset.log \ + utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ + lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=$srand \ + $egs_opts $chaindir/normalization.fst \ + "$train_subset_feats" $chaindir/tree $chaindir/0.trans_mdl \ + ark,s,cs:- "ark:$dir/train_subset_all.cegs" || touch $dir/.error & + wait; + [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + echo "... Getting subsets of validation examples for diagnostics and combination." + if $generate_egs_scp; then + valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.cegs,$dir/valid_diagnostic.scp" + train_diagnostic_output="ark,scp:$dir/train_diagnostic.cegs,$dir/train_diagnostic.scp" + else + valid_diagnostic_output="ark:$dir/valid_diagnostic.cegs" + train_diagnostic_output="ark:$dir/train_diagnostic.cegs" + fi + $cmd $dir/log/create_valid_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \ + ark:$dir/valid_combine.cegs || touch $dir/.error & + $cmd $dir/log/create_valid_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \ + $valid_diagnostic_output || touch $dir/.error & + + $cmd $dir/log/create_train_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \ + ark:$dir/train_combine.cegs || touch $dir/.error & + $cmd $dir/log/create_train_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ + $train_diagnostic_output || touch $dir/.error & + wait + sleep 5 # wait for file system to sync. + if $generate_egs_scp; then + cat $dir/valid_combine.cegs $dir/train_combine.cegs | \ + nnet3-chain-copy-egs ark:- ark,scp:$dir/combine.cegs,$dir/combine.scp + rm $dir/{train,valid}_combine.scp + else + cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs + fi + + for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do + [ ! 
-s $f ] && echo "No examples in file $f" && exit 1; + done + rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs +fi + +if [ $stage -le 4 ]; then + # create cegs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. + + egs_list= + for n in $(seq $num_archives_intermediate); do + egs_list="$egs_list ark:$dir/cegs_orig.JOB.$n.ark" + done + echo "$0: Generating training examples on disk" + + # The examples will go round-robin to egs_list. Note: we omit the + # 'normalization.fst' argument while creating temporary egs: the phase of egs + # preparation that involves the normalization FST is quite CPU-intensive and + # it's more convenient to do it later, in the 'shuffle' stage. Otherwise to + # make it efficient we need to use a large 'nj', like 40, and in that case + # there can be too many small files to deal with, because the total number of + # files is the product of 'nj' by 'num_archives_intermediate', which might be + # quite large. + $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ + utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp \| \ + lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=\$[JOB+$srand] $egs_opts --supervision.weight=$egs_weight \ + --num-frames-overlap=$frames_overlap_per_eg \ + "$feats" $chaindir/tree $chaindir/0.trans_mdl \ + ark,s,cs:- ark:- \| \ + nnet3-chain-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; +fi + +if [ $stage -le 5 ]; then + echo "$0: recombining and shuffling order of archives on disk" + # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and + # shuffle the order, writing to the egs.JOB.ark + + # the input is a concatenation over the input jobs. + egs_list= + for n in $(seq $nj); do + egs_list="$egs_list $dir/cegs_orig.$n.JOB.ark" + done + + if [ $archives_multiple == 1 ]; then # normal case. + if $generate_egs_scp; then + output_archive="ark,scp:$dir/cegs.JOB.ark,$dir/cegs.JOB.scp" + else + output_archive="ark:$dir/cegs.JOB.ark" + fi + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs --normalization-scale=$normalization_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- $output_archive || exit 1; + + if $generate_egs_scp; then + #concatenate cegs.JOB.scp in single cegs.scp + rm -rf $dir/cegs.scp + for j in $(seq $num_archives_intermediate); do + cat $dir/cegs.$j.scp || exit 1; + done > $dir/cegs.scp || exit 1; + for f in $dir/cegs.*.scp; do rm $f; done + fi + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). 
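    # For example (hypothetical numbers, only to illustrate the index
    # arithmetic used for the links below): with archives_multiple=2 and
    # num_archives_intermediate=3, intermediate archive x=2, split y=1 is
    # written through the link cegs.2.1.ark -> cegs.3.ark, since the final
    # archive index is (x-1)*archives_multiple + y = (2-1)*2 + 1 = 3.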
+ if $generate_egs_scp; then + output_archives="$(for y in $(seq $archives_multiple); do echo ark,scp:$dir/cegs.JOB.$y.ark,$dir/cegs.JOB.$y.scp; done)" + else + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)" + fi + for x in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + archive_index=$[($x-1)*$archives_multiple+$y] + # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark + ln -sf cegs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1 + done + done + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs --normalization-scale=$normalization_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:- \| \ + nnet3-chain-copy-egs ark:- $output_archives || exit 1; + + if $generate_egs_scp; then + #concatenate cegs.JOB.scp in single cegs.scp + rm -rf $dir/cegs.scp + for j in $(seq $num_archives_intermediate); do + for y in $(seq $num_archives_intermediate); do + cat $dir/cegs.$j.$y.scp || exit 1; + done + done > $dir/cegs.scp || exit 1; + for f in $dir/cegs.*.*.scp; do rm $f; done + fi + fi +fi + +if [ $stage -le 6 ]; then + echo "$0: removing temporary archives" + ( + cd $dir + for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done + # the next statement removes them if we weren't using the soft links to a + # 'storage' directory. + rm cegs_orig.*.ark 2>/dev/null + ) + if [ $archives_multiple -gt 1 ]; then + # there are some extra soft links that we should delete. + for f in $dir/cegs.*.*.ark; do rm $f; done + fi + echo "$0: removing temporary lattices" + rm $dir/lat.* + echo "$0: removing temporary alignments and transforms" + # Ignore errors below because trans.* might not exist. + rm $dir/{ali,trans}.{ark,scp} 2>/dev/null + +fi + +echo "$0: Finished preparing training examples" + diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index 493353c675b..7dade75a0ed 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -92,19 +92,18 @@ if [ $stage -le 1 ]; then # option for n in `seq 0 $[num_alignments-1]`; do this_num_repeats=${num_repeats_array[$n]} - this_alignment_adir=${ali_dirs[$n]} - num_jobs=$(cat $this_alignment_adir/num_jobs) + this_alignment_dir=${ali_dirs[$n]} + num_jobs=$(cat $this_alignment_dir/num_jobs) if ! 
[ "$this_num_repeats" -gt 0 ]; then echo "Expected comma-separated list of integers for --num-repeats option, got '$num_repeats'" exit 1 fi - for j in $(seq $num_jobs); do gunzip -c $this_alignment_adir/ali.$j.gz; done | \ + for j in $(seq $num_jobs); do gunzip -c $this_alignment_dir/ali.$j.gz; done | \ ali-to-phones $this_alignment_dir/final.mdl ark:- "ark:|gzip -c >$dir/phones.$n.gz" || exit 1; - all_phones="$all_phones $(for r in $(seq $num_repeats); do echo $dir/phones.$n.gz; done)" - done + all_phones="$all_phones $(for r in $(seq $this_num_repeats); do echo $dir/phones.$n.gz; done)" done $cmd $dir/log/make_phone_lm_fst.log \ diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 41791606c31..275355d6695 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -32,6 +32,7 @@ extra_left_context_initial=-1 extra_right_context_final=-1 online_ivector_dir= minimize=false +write_compact=true # End configuration section. echo "$0 $@" # Print the command line for logging @@ -118,10 +119,17 @@ if [ ! -z "$online_ivector_dir" ]; then ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" fi +opts= +lat_wspecifier="ark:|" +if ! $write_compact; then + opts="--determinize-lattice=false" + lat_wspecifier="ark:| lattice-determinize-phone-pruned-non-compact --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize $model ark:- ark:- |" +fi + if [ "$post_decode_acwt" == 1.0 ]; then - lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" + lat_wspecifier="$lat_wspecifier gzip -c >$dir/lat.JOB.gz" else - lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" + lat_wspecifier="$lat_wspecifier lattice-scale --acoustic-scale=$post_decode_acwt --write-compact=$write_compact ark:- ark:- | gzip -c >$dir/lat.JOB.gz" fi frame_subsampling_opt= @@ -140,7 +148,8 @@ if [ $stage -le 1 ]; then --extra-right-context-final=$extra_right_context_final \ --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ - --word-symbol-table=$graphdir/words.txt "$model" \ + --word-symbol-table=$graphdir/words.txt ${opts} \ + "$model" \ $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1; fi diff --git a/egs/wsj/s5/steps/nnet3/get_degs.sh b/egs/wsj/s5/steps/nnet3/get_degs.sh index 63271e3adaa..d847246ca14 100755 --- a/egs/wsj/s5/steps/nnet3/get_degs.sh +++ b/egs/wsj/s5/steps/nnet3/get_degs.sh @@ -334,7 +334,7 @@ fi # set the command to determinize lattices, if specified. 
if $determinize_before_split; then - lattice_determinize_cmd="lattice-determinize-pruned-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=true --beam=$lattice_beam ark:- ark:-" + lattice_determinize_cmd="lattice-determinize-phone-pruned-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=true --beam=$lattice_beam $dir/final.mdl ark:- ark:-" else lattice_determinize_cmd="cat" fi diff --git a/src/chain/Makefile b/src/chain/Makefile index ca23450ae50..bc89da04e57 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -5,11 +5,11 @@ include ../kaldi.mk LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) -TESTFILES = chain-supervision-test language-model-test +TESTFILES = chain-supervision-test language-model-test chain-supervision-splitter-test OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \ language-model.o chain-denominator.o chain-training.o \ - chain-denominator-smbr.o + chain-denominator-smbr.o chain-supervision-splitter.o ifeq ($(CUDA), true) OBJFILES += chain-kernels.o chain-smbr-kernels.o endif diff --git a/src/chain/chain-supervision-splitter-test.cc b/src/chain/chain-supervision-splitter-test.cc new file mode 100644 index 00000000000..c62edafed8a --- /dev/null +++ b/src/chain/chain-supervision-splitter-test.cc @@ -0,0 +1,228 @@ +// chain/chain-supervision-splitter-test.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2017 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
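// Overview of this test: it builds a small random transition model, generates
// a random phone-level CompactLattice with random alignments, converts it to
// a state-level Lattice, and then (in TestSupervisionLatticeSplitting below)
// splits random frame ranges out of that lattice, converts each piece to
// chain supervision via the tolerance FST, and checks that every label so
// produced is a pdf-id (plus one) that occurs within the configured
// left/right tolerance of the same frames in the original lattice.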
+ +#include "chain/chain-supervision-splitter.h" +#include "chain/chain-supervision.h" +#include "fstext/fstext-lib.h" +#include "hmm/hmm-test-utils.h" +#include "hmm/hmm-utils.h" +#include +#include "fstext/kaldi-fst-io.h" +#include "lat/lattice-functions.h" + +namespace kaldi { +namespace chain { + + +void FstToLabels(const fst::StdVectorFst &fst, + std::vector > *labels) { + std::vector state_times; + int32 num_frames = ComputeFstStateTimes(fst, &state_times); + + typedef fst::StdArc::Weight Weight; + typedef fst::StdArc::StateId StateId; + typedef fst::StdArc::Label Label; + + std::vector > temp_labels(num_frames); + labels->clear(); + labels->resize(num_frames); + + for (StateId s = 0; s < fst.NumStates(); s++) { + for (fst::ArcIterator aiter(fst, s); + !aiter.Done(); aiter.Next()) { + const fst::StdArc &arc = aiter.Value(); + + int32 t = state_times[s]; + KALDI_ASSERT(arc.ilabel == arc.olabel && arc.ilabel != 0); + + temp_labels[t].insert(arc.olabel); + } + } + + int32 t = 0; + for (std::vector >::const_iterator it = temp_labels.begin(); + it != temp_labels.end(); ++it, t++) { + (*labels)[t].Init(*it); + } +} + +void TestSupervisionLatticeSplitting( + const SupervisionOptions &sup_opts, + const fst::StdVectorFst &tolerance_fst, + const TransitionModel &trans_model, + Lattice &lat) { + + fst::TopSort(&lat); + + chain::SupervisionLatticeSplitterOptions opts; + chain::SupervisionLatticeSplitter sup_lat_splitter( + opts, trans_model, lat); + + std::vector state_times; + int32 num_frames_lat = LatticeStateTimes(lat, &state_times); + + Posterior post; + LatticeForwardBackward(lat, &post); + + KALDI_ASSERT(num_frames_lat == post.size()); + + std::vector > pdfs(post.size()); + for (size_t i = 0; i < post.size(); i++) { + std::vector this_pdfs; + for (size_t j = 0; j < post[i].size(); j++) { + this_pdfs.push_back(trans_model.TransitionIdToPdf(post[i][j].first) + 1); + } + pdfs[i].Init(this_pdfs); + } + + for (int32 i = 0; i < 3; i++) { + int32 start_frame = RandInt(0, num_frames_lat - 1), + num_frames = RandInt(1,10); + + if (start_frame + num_frames > num_frames_lat) { + num_frames = num_frames_lat - start_frame; + } + + Lattice lat_part; + sup_lat_splitter.GetFrameRange(start_frame, num_frames, &lat_part); + + ScaleLattice(fst::LatticeScale(1.0, 0.0), &lat_part); + + chain::Supervision supervision_part; + + chain::PhoneLatticeToSupervision(tolerance_fst, + trans_model, lat_part, + &supervision_part); + + std::vector > labels; + FstToLabels(supervision_part.fst, &labels); + + KALDI_ASSERT(labels.size() == num_frames); + + for (int32 t = 0; t < labels.size(); t++) { + for (ConstIntegerSet::iterator it = labels[t].begin(); + it != labels[t].end(); ++it) { + // To check that each label is a pdf (1-indexed) within the tolerance + // in the original + bool label_in_original = false; + for (int32 n = std::max(start_frame + t - sup_opts.left_tolerance, 0); + n <= std::min(start_frame + t + sup_opts.right_tolerance, num_frames_lat - 1); + n++) { + if (pdfs[n].count(*it)) { + label_in_original = true; + break; + } + } + KALDI_ASSERT(label_in_original); + } + } + + std::vector self_loop_pdfs_list; + for (int32 tid = 1; tid <= trans_model.NumTransitionIds(); tid++) { + if (trans_model.IsSelfLoop(tid)) { + int32 tstate = trans_model.TransitionIdToTransitionState(tid); + int32 pdf = trans_model.TransitionStateToSelfLoopPdf(tstate); + self_loop_pdfs_list.push_back(pdf); + } + } + + ConstIntegerSet self_loop_pdfs(self_loop_pdfs_list); + + // To check that each self-loop pdf in the original is contained as 
a label + // in at least 2 of the tolerance values of the split lattices. + for (int32 n = start_frame; n < start_frame + num_frames; n++) { + for (ConstIntegerSet::iterator it = pdfs[n].begin(); + it != pdfs[n].end(); ++it) { + if (!self_loop_pdfs.count(*it - 1)) continue; // Ignore forward pdfs + int32 pdf_count = 0; + for (int32 t = std::max(n - start_frame - sup_opts.left_tolerance, 0); + t <= std::min(n - start_frame + sup_opts.right_tolerance, num_frames - 1); t++) { + pdf_count += labels[t].count(*it); + } + //KALDI_ASSERT(pdf_count > 1); + } + } + } +} + +void ChainSupervisionSplitterTest(int32 index) { + ContextDependency *ctx_dep; + TransitionModel *trans_model = GenRandTransitionModel(&ctx_dep, 2); + const std::vector &phones = trans_model->GetPhones(); + + int32 subsample_factor = 1; + + int32 phone_sequence_length = RandInt(1, 10); + + CompactLattice clat; + int32 cur_state = clat.AddState(); + clat.SetStart(cur_state); + + bool reorder = true; + + int32 num_frames_subsampled = 0; + for (int32 i = 0; i < phone_sequence_length; i++) { + int32 phone = phones[RandInt(0, phones.size() - 1)]; + int32 next_state = clat.AddState(); + + std::vector tids; + GenerateRandomAlignment(*ctx_dep, *trans_model, reorder, + std::vector(1, phone), &tids); + clat.AddArc(cur_state, + CompactLatticeArc(phone, phone, + CompactLatticeWeight(LatticeWeight::One(), + tids), next_state)); + cur_state = next_state; + num_frames_subsampled += tids.size(); + } + clat.SetFinal(cur_state, CompactLatticeWeight::One()); + + Lattice lat; + fst::ConvertLattice(clat, &lat); + + chain::SupervisionOptions sup_opts; + sup_opts.left_tolerance = 1; + sup_opts.right_tolerance = 1; + sup_opts.frame_subsampling_factor = subsample_factor; + sup_opts.lm_scale = 0.5; + + fst::StdVectorFst tolerance_fst; + MakeToleranceEnforcerFst(sup_opts, *trans_model, &tolerance_fst); + WriteFstKaldi(std::cerr, false, tolerance_fst); + + fst::ArcSort(&tolerance_fst, fst::ILabelCompare()); + + TestSupervisionLatticeSplitting(sup_opts, tolerance_fst, *trans_model, lat); + + delete ctx_dep; + delete trans_model; +} + +} // namespace chain +} // namespace kaldi + +int main() { + using namespace kaldi; + SetVerboseLevel(2); + for (int32 i = 0; i < 10; i++) { + kaldi::chain::ChainSupervisionSplitterTest(i); + } + //kaldi::chain::TestRanges(); +} diff --git a/src/chain/chain-supervision-splitter.cc b/src/chain/chain-supervision-splitter.cc new file mode 100644 index 00000000000..0a07876a1a2 --- /dev/null +++ b/src/chain/chain-supervision-splitter.cc @@ -0,0 +1,481 @@ +// chain/chain-supervision-splitter.cc + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2014-2015 Vimal Manohar +// 2017 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
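// A minimal sketch of how the pieces defined in this file are meant to be
// used together (assumes the caller provides a non-compact Lattice 'lat', a
// TransitionModel 'trans_model', SupervisionOptions 'sup_opts' and chunk
// boundaries 'begin_frame'/'num_frames'; illustration only):
//
//   fst::StdVectorFst tolerance_fst;
//   MakeToleranceEnforcerFst(sup_opts, trans_model, &tolerance_fst);
//   fst::ArcSort(&tolerance_fst, fst::ILabelCompare<fst::StdArc>());
//
//   SupervisionLatticeSplitterOptions splitter_opts;
//   SupervisionLatticeSplitter splitter(splitter_opts, trans_model, lat);
//
//   Lattice lat_part;
//   splitter.GetFrameRange(begin_frame, num_frames, &lat_part);
//   // keep only the graph costs before conversion to supervision
//   fst::ScaleLattice(fst::LatticeScale(1.0, 0.0), &lat_part);
//
//   chain::Supervision supervision_part;
//   PhoneLatticeToSupervision(tolerance_fst, trans_model, lat_part,
//                             &supervision_part);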
+ +#include "chain/chain-supervision-splitter.h" +#include "chain/chain-supervision.h" +#include "lat/lattice-functions.h" + +namespace kaldi { +namespace chain { + +typedef fst::ArcTpl LatticeArc; +typedef fst::VectorFst Lattice; + +SupervisionLatticeSplitter::SupervisionLatticeSplitter( + const SupervisionLatticeSplitterOptions &opts, + const TransitionModel &trans_model, const Lattice &lat): + opts_(opts), trans_model_(trans_model), lat_(lat) { + PrepareLattice(); + + int32 num_states = lat_.NumStates(); + + KALDI_ASSERT(num_states > 0); // TODO: Might have to be skipped instead. + int32 start_state = lat_.Start(); + + // Lattice should be top-sorted and connected, so start-state must be 0. + KALDI_ASSERT(start_state == 0 && "Expecting start-state to be 0"); + + KALDI_ASSERT(num_states == lat_scores_.state_times.size()); + KALDI_ASSERT(lat_scores_.state_times[start_state] == 0); +} + +void SupervisionLatticeSplitter::GetFrameRange( + int32 begin_frame, int32 num_frames, + Lattice *lat_out) const { + int32 end_frame = begin_frame + num_frames; + // Note: end_frame is not included in the range of frames that the + // output supervision object covers; it's one past the end. + KALDI_ASSERT(num_frames > 0 && begin_frame >= 0 && + begin_frame + num_frames <= lat_scores_.state_times.back()); + + CreateRangeLattice(begin_frame, end_frame, lat_out); + + if (opts_.acoustic_scale != 1.0) { + fst::ScaleLattice(fst::AcousticLatticeScale( + 1.0 / opts_.acoustic_scale), lat_out); + } +} + +void SupervisionLatticeSplitter::LatticeInfo::Check() const { + // Check if all the vectors are of size num_states + KALDI_ASSERT(state_times.size() == alpha.size() && + state_times.size() == beta.size()); + + // Check that the states are ordered in increasing order of state_times. + // This must be true since the states are in breadth-first search order. + KALDI_ASSERT(IsSorted(state_times)); +} + +void SupervisionLatticeSplitter::PrepareLattice() { + // Scale the lattice to appropriate acoustic scale. It is important to + // ensure this is equal to the acoustic scale used while training. This is + // because, on splitting lattices, the initial and final costs are added + // into the graph cost. + KALDI_ASSERT(opts_.acoustic_scale != 0.0); + if (opts_.acoustic_scale != 1.0) + fst::ScaleLattice(fst::AcousticLatticeScale( + opts_.acoustic_scale), &lat_); + + KALDI_ASSERT(fst::TopSort(&lat_)); + LatticeStateTimes(lat_, &(lat_scores_.state_times)); + int32 num_states = lat_.NumStates(); + std::vector > state_time_indexes(num_states); + for (int32 s = 0; s < num_states; s++) { + state_time_indexes[s] = std::make_pair(lat_scores_.state_times[s], s); + } + + // Order the states based on the state times. This is stronger than just + // topological sort. This is required by the lattice splitting code. 
+ std::sort(state_time_indexes.begin(), state_time_indexes.end()); + + std::vector state_order(num_states); + for (int32 s = 0; s < num_states; s++) { + state_order[state_time_indexes[s].second] = s; + } + + fst::StateSort(&lat_, state_order); + ComputeLatticeScores(); +} + +void SupervisionLatticeSplitter::CreateRangeLattice( + int32 begin_frame, int32 end_frame, + Lattice *out_lat) const { + typedef Lattice::StateId StateId; + typedef LatticeArc::Label Label; + + const std::vector &state_times = lat_scores_.state_times; + + // Some checks to ensure the lattice and scores are prepared properly + KALDI_ASSERT(state_times.size() == lat_.NumStates()); + if (!lat_.Properties(fst::kTopSorted, true)) + KALDI_ERR << "Input lattice must be topologically sorted."; + + std::vector::const_iterator begin_iter = + std::lower_bound(state_times.begin(), state_times.end(), begin_frame), + end_iter = std::lower_bound(begin_iter, + state_times.end(), end_frame); + + KALDI_ASSERT(*begin_iter == begin_frame && + (begin_iter == state_times.begin() || + begin_iter[-1] < begin_frame)); + // even if end_frame == supervision_.num_frames, there should be a state with + // that frame index. + KALDI_ASSERT(end_iter[-1] < end_frame && + (end_iter < state_times.end() || *end_iter == end_frame)); + StateId begin_state = begin_iter - state_times.begin(), + end_state = end_iter - state_times.begin(); + + KALDI_ASSERT(end_state > begin_state); + out_lat->DeleteStates(); + out_lat->ReserveStates(end_state - begin_state + 2); + + // Add special start state + StateId start_state = out_lat->AddState(); + out_lat->SetStart(start_state); + + for (StateId i = begin_state; i < end_state; i++) + out_lat->AddState(); + + // Add the special final-state. + StateId final_state = out_lat->AddState(); + out_lat->SetFinal(final_state, LatticeWeight::One()); + + for (StateId state = begin_state; state < end_state; state++) { + StateId output_state = state - begin_state + 1; + if (state_times[state] == begin_frame) { + // we'd like to make this an initial state, but OpenFst doesn't allow + // multiple initial states. Instead we add an epsilon transition to it + // from our actual initial state. The weight on this + // transition is the forward probability of the said 'initial state' + LatticeWeight weight = LatticeWeight::One(); + weight.SetValue1((opts_.normalize ? lat_scores_.beta[0] : 0.0) - lat_scores_.alpha[state]); + // Add negative of the forward log-probability to the graph cost score, + // since the acoustic scores would be changed later. + // Assuming that the lattice is scaled with appropriate acoustic + // scale. + // We additionally normalize using the total lattice score. Since the + // same score is added as normalizer to all the paths in the lattice, + // the relative probabilities of the paths in the lattice is not affected. + // Note: Doing a forward-backward on this split must result in a total + // score of 0 because of the normalization. + + out_lat->AddArc(start_state, + LatticeArc(0, 0, weight, output_state)); + } else { + KALDI_ASSERT(lat_scores_.state_times[state] < end_frame); + } + for (fst::ArcIterator aiter(lat_, state); + !aiter.Done(); aiter.Next()) { + const LatticeArc &arc = aiter.Value(); + StateId nextstate = arc.nextstate; + if (nextstate >= end_state) { + // A transition to any state outside the range becomes a transition to + // our special final-state. + // The weight is just the negative of the backward log-probability + + // the arc cost. We again normalize with the total lattice score. 
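        // Sanity check of the normalization (sketch, assuming opts_.normalize
        // is true): a complete path through the split piece picks up cost
        // (beta[0] - alpha[s_in]) on its initial epsilon arc and cost
        // (arc cost - beta[s_out]) here, where s_in is its entry state and
        // s_out the destination of its boundary-crossing arc.  Writing A()
        // and B() for the forward and backward probabilities (whose logs are
        // stored in lat_scores_), the total probability of the split piece is
        //   exp(-beta[0]) * sum over paths of A(s_in) P(interior) P(arc) B(s_out),
        // and that sum is exactly the total probability exp(beta[0]) of the
        // whole lattice, so the split piece sums to one, i.e. a total score
        // of 0, as claimed in the comment on the initial arcs above.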
+ LatticeWeight weight; + //KALDI_ASSERT(lat_scores_.beta[state] < 0); + weight.SetValue1(arc.weight.Value1() - lat_scores_.beta[nextstate]); + weight.SetValue2(arc.weight.Value2()); + // Add negative of the backward log-probability to the LM score, since + // the acoustic scores would be changed later. + // Note: We don't normalize here because that is already done with the + // initial cost. + + out_lat->AddArc(output_state, + LatticeArc(arc.ilabel, arc.olabel, weight, final_state)); + } else { + StateId output_nextstate = nextstate - begin_state + 1; + + if (opts_.add_phone_label_for_half_transition) { + int32 tid = arc.ilabel; + int32 phone = trans_model_.TransitionIdToPhone(tid); + + Label olabel = arc.olabel; + + if (olabel == 0) { + olabel = phone; + } else { + KALDI_ASSERT(phone == olabel); + } + out_lat->AddArc(output_state, + LatticeArc(arc.ilabel, olabel, arc.weight, output_nextstate)); + } else { + out_lat->AddArc(output_state, + LatticeArc(arc.ilabel, arc.olabel, arc.weight, output_nextstate)); + } + } + } + } +} + +void SupervisionLatticeSplitter::ComputeLatticeScores() { + LatticeStateTimes(lat_, &(lat_scores_.state_times)); + ComputeLatticeAlphasAndBetas(lat_, false, + &(lat_scores_.alpha), &(lat_scores_.beta)); + lat_scores_.Check(); + // This check will fail if the lattice is not breadth-first search sorted +} + +class ToleranceEnforcerFstCreator { + public: + ToleranceEnforcerFstCreator( + const SupervisionOptions &opts, const TransitionModel &trans_model, + fst::StdVectorFst *fst); + + void MakeFst(); + + private: + typedef fst::StdArc::Weight Weight; + typedef fst::StdArc::StateId StateId; + typedef fst::StdArc::Label Label; + + void AddSelfLoops(int32 offset); + void AddArcToTempStates(int32 offset); + void InsertSelfLoopTransitions(int32 offset); + void DeleteSelfLoopTransitions(int32 offset); + + const SupervisionOptions &opts_; + const TransitionModel &trans_model_; + + int32 num_forward_transitions_; // number of forward transitions in the + // transition model + int32 num_offsets_; // number of offsets (tolerances) + + // The index corresponding to the zero offset. + // offset_index = offset + zero_offset_index_ + int32 zero_offset_index_; + + fst::StdVectorFst *fst_; +}; + +ToleranceEnforcerFstCreator::ToleranceEnforcerFstCreator( + const SupervisionOptions &opts, const TransitionModel &trans_model, + fst::StdVectorFst *fst): + opts_(opts), trans_model_(trans_model), fst_(fst) { + + num_forward_transitions_ = 0; + for (int32 trans_id = 1; trans_id <= trans_model_.NumTransitionIds(); + trans_id++) { + if (!trans_model_.IsSelfLoop(trans_id)) { + num_forward_transitions_++; + } + } + num_offsets_ = opts_.left_tolerance + opts_.right_tolerance + 1; + zero_offset_index_ = opts_.left_tolerance; + + fst_->DeleteStates(); +} + +void ToleranceEnforcerFstCreator::AddSelfLoops(int32 offset) { + StateId state = (offset + zero_offset_index_) * (num_forward_transitions_ + 1); + for (int32 trans_id = 1; trans_id <= trans_model_.NumTransitionIds(); + trans_id++) { + int32 pdf_id = trans_model_.TransitionIdToPdf(trans_id); + fst_->AddArc(state, + fst::StdArc(trans_id, pdf_id + 1, + fst::TropicalWeight::One(), state)); + } +} + +/* This function adds arcs from each "offset" state to a temporary state + * emitting a forward-pdf. These temporary states have arcs to states + * "offset+1" and "offset-1" (other than the boundaries). These arcs will + * be added later by the function DeleteSelfLoopTransitions and + * InsertSelfLoopTransitions. 
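 *
 * For example, with left_tolerance = right_tolerance = 1 there are three
 * offsets (-1, 0, +1); if the transition model has F forward (non-self-loop)
 * transitions, the FST built in MakeFst() below has 3 * (F + 1) states, and
 * state (F + 1), the offset-0 state, is both the start state and the only
 * final state, so an accepted path must insert and delete equally many
 * self-loops overall.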
+ */ +void ToleranceEnforcerFstCreator::AddArcToTempStates(int32 offset) { + StateId state = (offset + zero_offset_index_) * (num_forward_transitions_ + 1); + KALDI_ASSERT(state < fst_->NumStates()); + + int32 forward_idx = 1; + for (Label trans_id = 1; + trans_id <= trans_model_.NumTransitionIds(); + trans_id++) { + if (!trans_model_.IsSelfLoop(trans_id)) { + // Add a temporary state for each non-self loop transition + KALDI_ASSERT(forward_idx <= num_forward_transitions_); + StateId next_state = state + forward_idx; + KALDI_ASSERT(next_state < fst_->NumStates()); + int32 pdf_id = trans_model_.TransitionIdToPdf(trans_id); + + fst_->AddArc(state, + fst::StdArc(trans_id, pdf_id + 1, + fst::TropicalWeight::One(), next_state)); + forward_idx++; + } + } +} + +/* This function adds arcs out of temporary states corresponding to each offset + * offset that will delete self-loop transition-ids. Doing so will result in + * moving to the state corresponding to offset one lower. + */ +void ToleranceEnforcerFstCreator::DeleteSelfLoopTransitions(int32 offset) { + KALDI_ASSERT(offset >= -opts_.left_tolerance && offset <= opts_.right_tolerance); + + // If offset is at the left-tolerance, we cannot decrease it further. + if (offset == -opts_.left_tolerance) return; + int32 next_offset = offset - 1; + + StateId state = (offset + zero_offset_index_) * (num_forward_transitions_ + 1); + StateId next_offset_state = (next_offset + zero_offset_index_) + * (num_forward_transitions_ + 1); + + KALDI_ASSERT(state < fst_->NumStates() && next_offset_state < fst_->NumStates()); + + int32 forward_idx = 1; + for (Label trans_id = 1; + trans_id <= trans_model_.NumTransitionIds(); + trans_id++) { + if (!trans_model_.IsSelfLoop(trans_id)) { + KALDI_ASSERT(forward_idx <= num_forward_transitions_); + StateId next_state = state + forward_idx; + KALDI_ASSERT(next_state < fst_->NumStates()); + // We already added an arc to this next_state in the function + // AddArcToTempStates. Now we only need to delete a self-loop + // transition, which can be done by emitting an epsilon on the output. + + int32 tstate = trans_model_.TransitionIdToTransitionState(trans_id); + Label self_loop_tid = trans_model_.SelfLoopOf(tstate); + + fst_->AddArc(next_state, + fst::StdArc(self_loop_tid, 0, + fst::TropicalWeight::One(), next_offset_state)); + + forward_idx++; + } + } +} + +/* This function adds arcs out of temporary states corresponding to each offset + * offset that will insert self-loop transition-ids. Doing so will result in + * moving to the state corresponding to offset one higher. + */ +void ToleranceEnforcerFstCreator::InsertSelfLoopTransitions(int32 offset) { + KALDI_ASSERT(offset >= -opts_.left_tolerance && offset <= opts_.right_tolerance); + + // If offset is at the right-tolerance, we cannot increase it further. 
+ if (offset == opts_.right_tolerance) return; + int32 next_offset = offset + 1; + + StateId state = (offset + zero_offset_index_) * (num_forward_transitions_ + 1); + StateId next_offset_state = (next_offset + zero_offset_index_) + * (num_forward_transitions_ + 1); + + KALDI_ASSERT(state < fst_->NumStates() && next_offset_state < fst_->NumStates()); + + int32 forward_idx = 1; + for (Label trans_id = 1; + trans_id <= trans_model_.NumTransitionIds(); + trans_id++) { + if (!trans_model_.IsSelfLoop(trans_id)) { + KALDI_ASSERT(forward_idx <= num_forward_transitions_); + StateId next_state = state + forward_idx; + KALDI_ASSERT(next_state < fst_->NumStates()); + // We already added an arc to this next_state in the function + // AddArcToTempStates. Now we only need to insert a self-loop + // transition, which can be done by emitting an epsilon on the input + // side with the self-loop pdf on the output. + + int32 tstate = trans_model_.TransitionIdToTransitionState(trans_id); + int32 self_loop_pdf = trans_model_.TransitionStateToSelfLoopPdf(tstate); + + fst_->AddArc(next_state, + fst::StdArc(0, self_loop_pdf + 1, + fst::TropicalWeight::One(), next_offset_state)); + + forward_idx++; + } + } +} + +void ToleranceEnforcerFstCreator::MakeFst() { + int32 num_states = num_offsets_ * (num_forward_transitions_ + 1); + fst_->ReserveStates(num_states); + + for (int32 s = 0; s < num_states; s++) + fst_->AddState(); + + StateId start_state = zero_offset_index_ * (num_forward_transitions_ + 1); + fst_->SetStart(start_state); + fst_->SetFinal(start_state, fst::TropicalWeight::One()); + + for (int32 o = -opts_.left_tolerance; o <= opts_.right_tolerance; o++) { + AddSelfLoops(o); + AddArcToTempStates(o); + DeleteSelfLoopTransitions(o); + InsertSelfLoopTransitions(o); + } + + KALDI_ASSERT(fst_->Start() == zero_offset_index_ * (num_forward_transitions_ + 1)); +} + +void MakeToleranceEnforcerFst( + const SupervisionOptions &opts, const TransitionModel &trans_model, + fst::StdVectorFst *fst) { + ToleranceEnforcerFstCreator creator(opts, trans_model, fst); + creator.MakeFst(); +} + +bool PhoneLatticeToSupervision(const fst::StdVectorFst &tolerance_fst, + const TransitionModel &trans_model, + const Lattice &lat, + chain::Supervision *supervision, + bool debug) { + fst::StdVectorFst transition_id_fst; + ConvertLattice(lat, &transition_id_fst); + Project(&transition_id_fst, fst::PROJECT_INPUT); // Keep only the transition-ids. + if (transition_id_fst.Properties(fst::kIEpsilons, true) != 0) { + // remove epsilons, if there are any. + fst::RmEpsilon(&transition_id_fst); + } + KALDI_ASSERT(transition_id_fst.NumStates() > 0); + + fst::TableComposeOptions compose_opts; + compose_opts.table_match_type = fst::MATCH_INPUT; + + TableCompose(transition_id_fst, tolerance_fst, &(supervision->fst), + compose_opts); + fst::Connect(&(supervision->fst)); + + if (debug) { + fst::Project(&(supervision->fst), fst::PROJECT_OUTPUT); + fst::RmEpsilon(&(supervision->fst)); + + return true; + } + + // at this point supervision->fst will have pdf-ids plus one as the olabels, + // but still transition-ids as the ilabels. Copy olabels to ilabels. 
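  // (The "plus one" keeps label 0 free to act as epsilon on these FSTs; after
  // the projection and epsilon removal below, both ilabels and olabels are
  // pdf-ids plus one, which is also the labelling that the checks in
  // chain-supervision-splitter-test.cc assume.)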
+ fst::Project(&(supervision->fst), fst::PROJECT_OUTPUT); + + fst::RmEpsilon(&(supervision->fst)); + fst::DeterminizeInLog(&(supervision->fst)); + + KALDI_ASSERT(supervision->fst.Properties(fst::kIEpsilons, true) == 0); + if (supervision->fst.NumStates() == 0) { + KALDI_WARN << "Supervision FST is empty (too many phones for too few " + << "frames?)"; + // possibly there were too many phones for too few frames. + return false; + } + + supervision->weight = 1.0; + supervision->num_sequences = 1; + supervision->frames_per_sequence = 0; + supervision->label_dim = trans_model.NumPdfs(); + SortBreadthFirstSearch(&(supervision->fst)); + return true; +} + +} // end namespace chain +} // end namespace kaldi diff --git a/src/chain/chain-supervision-splitter.h b/src/chain/chain-supervision-splitter.h new file mode 100644 index 00000000000..7e5ba7845bf --- /dev/null +++ b/src/chain/chain-supervision-splitter.h @@ -0,0 +1,121 @@ +// chain/chain-supervision-splitter.h + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2014-2015 Vimal Manohar +// 2017 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_CHAIN_CHAIN_SUPERVISION_SPILTTER_H_ +#define KALDI_CHAIN_CHAIN_SUPERVISION_SPILTTER_H_ + +#include "hmm/transition-model.h" +#include "lat/kaldi-lattice.h" +#include "chain/chain-supervision.h" + +namespace kaldi { +namespace chain { + +typedef fst::ArcTpl LatticeArc; +typedef fst::VectorFst Lattice; + +struct SupervisionLatticeSplitterOptions { + BaseFloat acoustic_scale; + bool normalize; + bool add_phone_label_for_half_transition; + + SupervisionLatticeSplitterOptions(): + acoustic_scale(1.0), normalize(true), + add_phone_label_for_half_transition(false) { } + + void Register(OptionsItf *opts) { + opts->Register("acoustic-scale", &acoustic_scale, + "Apply acoustic scale on the lattices before splitting."); + opts->Register("normalize", &normalize, + "Normalize the initial and final scores added to split " + "lattices"); + opts->Register("add-phone-label-for-half-transition", + &add_phone_label_for_half_transition, + "Add a phone label to account for half phone transitions " + "in the split lattices"); + } +}; + +class SupervisionLatticeSplitter { + public: + SupervisionLatticeSplitter(const SupervisionLatticeSplitterOptions &opts, + const TransitionModel &trans_model, + const Lattice &lat); + + void GetFrameRange(int32 begin_frame, int32 frames_per_sequence, + Lattice *out_lat) const; + + // A structure used to store the forward and backward scores + // and state times of a lattice + struct LatticeInfo { + // These values are stored in log. 
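    // alpha[s] and beta[s] are the total forward and backward
    // log-probabilities of lattice state s (graph plus acoustic cost, after
    // the acoustic scaling applied in PrepareLattice()), and state_times[s]
    // is the frame index of state s; CreateRangeLattice() uses these to set
    // the initial and final weights of each split piece.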
+ std::vector alpha; + std::vector beta; + std::vector state_times; + + void Check() const; + }; + + private: + // Creates an output lattice covering frames begin_frame <= t < end_frame, + // assuming that the corresponding state-range that we need to + // include, begin_state <= s < end_state has been included. + // (note: the output lattice will also have two special initial and final + // states). + void CreateRangeLattice(int32 begin_frame, int32 end_frame, + Lattice *out_lat) const; + + // Function to compute lattice scores for a lattice + void ComputeLatticeScores(); + + // Prepare lattice : + // 1) Order states in breadth-first search order + // 2) Compute states times, which must be a strictly non-decreasing vector + // 3) Compute lattice alpha and beta scores + void PrepareLattice(); + + const SupervisionLatticeSplitterOptions &opts_; + + const TransitionModel &trans_model_; + + // LatticeInfo object for lattice. + // This will be computed when PrepareLattice function is called. + LatticeInfo lat_scores_; + + // Copy of the lattice. This is required because the lattice states + // need to be ordered in breadth-first search order. + Lattice lat_; +}; + +bool PhoneLatticeToSupervision(const fst::StdVectorFst &tolerance_fst, + const TransitionModel &trans_model, + const Lattice &lat, + chain::Supervision *supervision, + bool debug = false); + +void MakeToleranceEnforcerFst( + const SupervisionOptions &opts, const TransitionModel &trans_model, + fst::StdVectorFst *fst); + +} +} + +#endif // KALDI_CHAIN_CHAIN_SUPERVISION_SPLITTER_H_ diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index f7b5caf0e17..1a57522b1d1 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -77,6 +77,8 @@ void SupervisionOptions::Check() const { frame_subsampling_factor > 0 && left_tolerance + right_tolerance >= frame_subsampling_factor); + KALDI_ASSERT(lm_scale >= 0.0 && lm_scale < 1.0); + if (!silence_phones_str.empty()) { KALDI_ASSERT(left_tolerance_silence >= 0 && right_tolerance_silence >= 0 && left_tolerance_silence + right_tolerance_silence >= frame_subsampling_factor); @@ -189,7 +191,7 @@ bool PhoneLatticeToProtoSupervisionInternal(const SupervisionOptions &opts, int32 phone = lat_arc.ilabel; // It's an acceptor so ilabel == ollabel. if (phone == 0) { KALDI_WARN << "CompactLattice has epsilon arc. 
Unexpected."; - return false; + continue; } proto_supervision->fst.AddArc(state, fst::StdArc(phone, phone, @@ -842,6 +844,5 @@ void GetWeightsForRanges(int32 range_length, } } - } // namespace chain } // namespace kaldi diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile index 096040000eb..51f97ff7c55 100644 --- a/src/chainbin/Makefile +++ b/src/chainbin/Makefile @@ -10,8 +10,8 @@ BINFILES = chain-est-phone-lm chain-get-supervision chain-make-den-fst \ nnet3-chain-get-egs nnet3-chain-copy-egs nnet3-chain-merge-egs \ nnet3-chain-shuffle-egs nnet3-chain-subset-egs \ nnet3-chain-acc-lda-stats nnet3-chain-train nnet3-chain-compute-prob \ - nnet3-chain-combine nnet3-chain-normalize-egs - + nnet3-chain-combine nnet3-chain-normalize-egs \ + nnet3-chain-split-and-get-egs chain-split-lattices OBJFILES = diff --git a/src/chainbin/chain-split-lattices.cc b/src/chainbin/chain-split-lattices.cc new file mode 100644 index 00000000000..d8544cf6ba2 --- /dev/null +++ b/src/chainbin/chain-split-lattices.cc @@ -0,0 +1,185 @@ +// chainbin/chain-split-lattices.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "hmm/posterior.h" +#include "chain/chain-supervision-splitter.h" +#include "lat/lattice-functions.h" +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-example-utils.h" +#include "fstext/kaldi-fst-io.h" + +namespace kaldi { +namespace nnet3 { + + +/** + This function does all the processing for one utterance, and outputs the + supervision objects to 'example_writer'. Note: if normalization_fst is the + empty FST (with no states), it skips the final stage of egs preparation and + you should do it later with nnet3-chain-normalize-egs. 
+*/ + +static bool ProcessFile(const chain::SupervisionOptions &sup_opts, + const chain::SupervisionLatticeSplitterOptions &sup_lat_splitter_opts, + const TransitionModel &trans_model, + const Lattice &lat, + const fst::StdVectorFst &tolerance_fst, + const std::string &utt_id, + UtteranceSplitter *utt_splitter, + TableWriter *fst_writer, + bool debug = true) { + std::vector state_times; + + int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; + int32 num_frames = LatticeStateTimes(lat, &state_times) * frame_subsampling_factor; + + std::vector chunks; + + utt_splitter->GetChunksForUtterance(num_frames, &chunks); + + if (chunks.empty()) { + KALDI_WARN << "Not producing egs for utterance " << utt_id + << " because it is too short: " + << num_frames << " frames."; + return false; + } + + chain::SupervisionLatticeSplitter sup_lat_splitter( + sup_lat_splitter_opts, trans_model, lat); + + for (size_t c = 0; c < chunks.size(); c++) { + ChunkTimeInfo &chunk = chunks[c]; + + int32 start_frame_subsampled = chunk.first_frame / frame_subsampling_factor, + num_frames_subsampled = chunk.num_frames / frame_subsampling_factor; + + Lattice lat_part; + sup_lat_splitter.GetFrameRange(start_frame_subsampled, + num_frames_subsampled, + &lat_part); + + ScaleLattice(fst::LatticeScale(1.0, 0.0), &lat_part); + + chain::Supervision supervision_part; + chain::PhoneLatticeToSupervision(tolerance_fst, + trans_model, lat_part, + &supervision_part, debug); + + std::ostringstream oss; + oss << utt_id << "-" << start_frame_subsampled << "-" << num_frames_subsampled; + std::string key = oss.str(); + + fst_writer->Write(key, supervision_part.fst); + } + return true; +} + +} // namespace nnet3 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Split lattices to chain supervision FSTs\n" + "\n" + "Usage: chain-split-lattices [options] " + " \n"; + + ExampleGenerationConfig eg_config; // controls num-frames, + // left/right-context, etc. 
+ chain::SupervisionOptions sup_opts; + + int32 srand_seed = 0; + bool debug = true; + + ParseOptions po(usage); + po.Register("srand", &srand_seed, "Seed for random number generator "); + po.Register("debug", &debug, "Get FST before projection"); + + eg_config.Register(&po); + + ParseOptions supervision_opts("supervision", &po); + sup_opts.Register(&supervision_opts); + + chain::SupervisionLatticeSplitterOptions sup_lat_splitter_opts; + sup_lat_splitter_opts.Register(&po); + + po.Read(argc, argv); + + srand(srand_seed); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + std::string + trans_model_rxfilename, + lattice_rspecifier, fst_wspecifier; + trans_model_rxfilename = po.GetArg(1); + lattice_rspecifier = po.GetArg(2); + fst_wspecifier = po.GetArg(3); + + eg_config.ComputeDerived(); + UtteranceSplitter utt_splitter(eg_config); + + TransitionModel trans_model; + ReadKaldiObject(trans_model_rxfilename, &trans_model); + + SequentialLatticeReader lattice_reader(lattice_rspecifier); + TableWriter fst_writer(fst_wspecifier); + + int32 num_err = 0; + + fst::StdVectorFst tolerance_fst; + MakeToleranceEnforcerFst(sup_opts, trans_model, &tolerance_fst); + if (GetVerboseLevel() > 3) WriteFstKaldi(KALDI_LOG, false, tolerance_fst); + + fst::ArcSort(&tolerance_fst, fst::ILabelCompare()); + + for (; !lattice_reader.Done(); lattice_reader.Next()) { + std::string key = lattice_reader.Key(); + const Lattice &lat = lattice_reader.Value(); + if (!ProcessFile(sup_opts, sup_lat_splitter_opts, + trans_model, lat, tolerance_fst, + key, &utt_splitter, &fst_writer, debug)) + num_err++; + } + if (num_err > 0) + KALDI_WARN << num_err << " utterances had errors and could " + "not be processed."; + // utt_splitter prints stats in its destructor. + return utt_splitter.ExitStatus(); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + diff --git a/src/hmm/hmm-test-utils.cc b/src/hmm/hmm-test-utils.cc index ceca116c828..c9b29ce24af 100644 --- a/src/hmm/hmm-test-utils.cc +++ b/src/hmm/hmm-test-utils.cc @@ -23,10 +23,10 @@ namespace kaldi { -TransitionModel *GenRandTransitionModel(ContextDependency **ctx_dep_out) { +TransitionModel *GenRandTransitionModel(ContextDependency **ctx_dep_out, int32 max_phone) { std::vector phones; phones.push_back(1); - for (int32 i = 2; i < 20; i++) + for (int32 i = 2; i <= max_phone; i++) if (rand() % 2 == 0) phones.push_back(i); int32 N = 2 + rand() % 2, // context-size N is 2 or 3. diff --git a/src/hmm/hmm-test-utils.h b/src/hmm/hmm-test-utils.h index 495ebf278ae..148ac44c1be 100644 --- a/src/hmm/hmm-test-utils.h +++ b/src/hmm/hmm-test-utils.h @@ -33,7 +33,7 @@ namespace kaldi { // This function returns a randomly generated TransitionModel object. // If 'ctx_dep' is not NULL, it outputs to *ctx_dep a pointer to the // tree that was used to generate the transition model. -TransitionModel *GenRandTransitionModel(ContextDependency **ctx_dep); +TransitionModel *GenRandTransitionModel(ContextDependency **ctx_dep, int32 max_phone = 19); /// This function returns a HmmTopology object giving a normal 3-state topology, /// covering all phones in the list "phones". 
This is mainly of use in testing diff --git a/src/lat/lattice-functions.cc b/src/lat/lattice-functions.cc index b04b23702fb..94c12343e5a 100644 --- a/src/lat/lattice-functions.cc +++ b/src/lat/lattice-functions.cc @@ -421,7 +421,7 @@ void LatticeActivePhones(const Lattice &lat, const TransitionModel &trans, } void ConvertLatticeToPhones(const TransitionModel &trans, - Lattice *lat) { + Lattice *lat, bool replace_words) { typedef LatticeArc Arc; int32 num_states = lat->NumStates(); for (int32 state = 0; state < num_states; state++) { @@ -431,9 +431,13 @@ void ConvertLatticeToPhones(const TransitionModel &trans, arc.olabel = 0; // remove any word. if ((arc.ilabel != 0) // has a transition-id on input.. && (trans.TransitionIdToHmmState(arc.ilabel) == 0) - && (!trans.IsSelfLoop(arc.ilabel))) + && (!trans.IsSelfLoop(arc.ilabel))) { // && trans.IsFinal(arc.ilabel)) // there is one of these per phone... - arc.olabel = trans.TransitionIdToPhone(arc.ilabel); + if (replace_words) + arc.olabel = trans.TransitionIdToPhone(arc.ilabel); + else + arc.ilabel = trans.TransitionIdToPhone(arc.ilabel); + } aiter.SetValue(arc); } // end looping over arcs } // end looping over states @@ -1646,4 +1650,110 @@ void ComposeCompactLatticeDeterministic( fst::Connect(composed_clat); } + +void ComputeAcousticScoresMap( + const Lattice &lat, + unordered_map, std::pair, + PairHasher > *acoustic_scores) { + // typedef the arc, weight types + typedef Lattice::Arc Arc; + typedef Arc::Weight LatticeWeight; + typedef Arc::StateId StateId; + + acoustic_scores->clear(); + + std::vector state_times; + LatticeStateTimes(lat, &state_times); + + KALDI_ASSERT(lat.Start() == 0); + + for (StateId s = 0; s < lat.NumStates(); s++) { + int32 t = state_times[s]; + for (fst::ArcIterator aiter(lat, s); !aiter.Done(); + aiter.Next()) { + const Arc &arc = aiter.Value(); + const LatticeWeight &weight = arc.weight; + + int32 tid = arc.ilabel; + + if (tid != 0) { + unordered_map, std::pair, + PairHasher >::iterator it = acoustic_scores->find(std::make_pair(t, tid)); + if (it == acoustic_scores->end()) { + acoustic_scores->insert(std::make_pair(std::make_pair(t, tid), + std::make_pair(weight.Value2(), 1))); + } else { + if (it->second.second == 2 + && it->second.first / it->second.second != weight.Value2()) { + KALDI_VLOG(2) << "Transitions on the same frame have different " + << "acoustic costs for tid " << tid << "; " + << it->second.first / it->second.second + << " vs " << weight.Value2(); + } + it->second.first += weight.Value2(); + it->second.second++; + } + } else { + // Arcs with epsilon input label (tid) must have 0 acoustic cost + KALDI_ASSERT(weight.Value2() == 0); + } + } + + LatticeWeight f = lat.Final(s); + if (f != LatticeWeight::Zero()) { + // Final acoustic cost must be 0 as we are reading from + // non-determinized, non-compact lattice + KALDI_ASSERT(f.Value2() == 0.0); + } + } +} + +void ReplaceAcousticScoresFromMap( + const unordered_map, std::pair, + PairHasher > &acoustic_scores, + Lattice *lat) { + // typedef the arc, weight types + typedef Lattice::Arc Arc; + typedef Arc::Weight LatticeWeight; + typedef Arc::StateId StateId; + + fst::TopSort(lat); + + std::vector state_times; + LatticeStateTimes(*lat, &state_times); + + KALDI_ASSERT(lat->Start() == 0); + + for (StateId s = 0; s < lat->NumStates(); s++) { + int32 t = state_times[s]; + for (fst::MutableArcIterator aiter(lat, s); + !aiter.Done(); aiter.Next()) { + Arc arc(aiter.Value()); + + int32 tid = arc.ilabel; + if (tid != 0) { + unordered_map, std::pair, + PairHasher 
>::const_iterator it = acoustic_scores.find(std::make_pair(t, tid)); + if (it == acoustic_scores.end()) { + KALDI_ERR << "Could not find tid " << tid << " at time " << t + << " in the acoustic scores map."; + } else { + arc.weight.SetValue2(it->second.first / it->second.second); + } + } else { + // For epsilon arcs, set acoustic cost to 0.0 + arc.weight.SetValue2(0.0); + } + aiter.SetValue(arc); + } + + LatticeWeight f = lat->Final(s); + if (f != LatticeWeight::Zero()) { + // Set final acoustic cost to 0.0 + f.SetValue2(0.0); + lat->SetFinal(s, f); + } + } +} + } // namespace kaldi diff --git a/src/lat/lattice-functions.h b/src/lat/lattice-functions.h index 46390cdeef3..dcea6495e0a 100644 --- a/src/lat/lattice-functions.h +++ b/src/lat/lattice-functions.h @@ -152,7 +152,7 @@ void LatticeActivePhones(const Lattice &lat, const TransitionModel &trans, /// we do reorder). /// Also see PhoneAlignLattice, in phone-align-lattice.h. void ConvertLatticeToPhones(const TransitionModel &trans_model, - Lattice *lat); + Lattice *lat, bool replace_words = true); /// Prunes a lattice or compact lattice. Returns true on success, false if /// there was some kind of failure. @@ -377,6 +377,16 @@ void ComposeCompactLatticeDeterministic( fst::DeterministicOnDemandFst* det_fst, CompactLattice* composed_clat); +void ComputeAcousticScoresMap( + const Lattice &lat, + unordered_map, std::pair, + PairHasher > *acoustic_scores); + +void ReplaceAcousticScoresFromMap( + const unordered_map, std::pair, + PairHasher > &acoustic_scores, + Lattice *lat); + } // namespace kaldi #endif // KALDI_LAT_LATTICE_FUNCTIONS_H_ diff --git a/src/latbin/Makefile b/src/latbin/Makefile index 3885db0af71..fb9b3f5e71d 100644 --- a/src/latbin/Makefile +++ b/src/latbin/Makefile @@ -22,7 +22,7 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \ lattice-determinize-phone-pruned-parallel lattice-expand-ngram \ lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons \ lattice-arc-post lattice-determinize-non-compact \ - lattice-determinize-pruned-non-compact + lattice-determinize-phone-pruned-non-compact lattice-top-sort OBJFILES = diff --git a/src/latbin/lattice-align-phones.cc b/src/latbin/lattice-align-phones.cc index 9367fb1f3a7..9f9a11575dc 100644 --- a/src/latbin/lattice-align-phones.cc +++ b/src/latbin/lattice-align-phones.cc @@ -43,8 +43,10 @@ int main(int argc, char *argv[]) { " lattice-1best | nbest-to-prons\n"; ParseOptions po(usage); + bool write_compact = true; bool output_if_error = true; - + + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("output-error-lats", &output_if_error, "Output lattices that aligned " "with errors (e.g. due to force-out"); @@ -66,17 +68,44 @@ int main(int argc, char *argv[]) { TransitionModel tmodel; ReadKaldiObject(model_rxfilename, &tmodel); - SequentialCompactLatticeReader clat_reader(lats_rspecifier); - CompactLatticeWriter clat_writer(lats_wspecifier); + SequentialCompactLatticeReader clat_reader; + CompactLatticeWriter clat_writer; + SequentialLatticeReader lat_reader; + LatticeWriter lat_writer; + + if (write_compact) { + clat_reader.Open(lats_rspecifier); + clat_writer.Open(lats_wspecifier); + } else { + lat_reader.Open(lats_rspecifier); + lat_writer.Open(lats_wspecifier); + } int32 num_done = 0, num_err = 0; - for (; !clat_reader.Done(); clat_reader.Next()) { - std::string key = clat_reader.Key(); - const CompactLattice &clat = clat_reader.Value(); + for (; write_compact ? 
!clat_reader.Done() : !lat_reader.Done(); + write_compact ? clat_reader.Next() : lat_reader.Next()) { + std::string key = write_compact ? clat_reader.Key() : lat_reader.Key(); + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; CompactLattice aligned_clat; - bool ok = PhoneAlignLattice(clat, tmodel, opts, &aligned_clat); + bool ok; + if (write_compact) { + const CompactLattice &clat = clat_reader.Value(); + + ok = PhoneAlignLattice(clat, tmodel, opts, &aligned_clat); + } else { + const Lattice &lat = lat_reader.Value(); + ComputeAcousticScoresMap(lat, &acoustic_scores); + + CompactLattice clat; + fst::ConvertLattice(lat, &clat); + + ok = PhoneAlignLattice(clat, tmodel, opts, &aligned_clat); + } if (!ok) { num_err++; @@ -86,7 +115,18 @@ int main(int argc, char *argv[]) { if (aligned_clat.Start() != fst::kNoStateId) { KALDI_LOG << "Outputting partial lattice for " << key; TopSortCompactLatticeIfNeeded(&aligned_clat); - clat_writer.Write(key, aligned_clat); + + if (write_compact) { + clat_writer.Write(key, aligned_clat); + } else { + Lattice out_lat; + fst::ConvertLattice(aligned_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lat_writer.Write(key, out_lat); + } } } } else { @@ -97,7 +137,18 @@ int main(int argc, char *argv[]) { num_done++; KALDI_VLOG(2) << "Aligned lattice for " << key; TopSortCompactLatticeIfNeeded(&aligned_clat); - clat_writer.Write(key, aligned_clat); + + if (write_compact) { + clat_writer.Write(key, aligned_clat); + } else { + Lattice out_lat; + fst::ConvertLattice(aligned_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lat_writer.Write(key, out_lat); + } } } } diff --git a/src/latbin/lattice-determinize-phone-pruned-non-compact.cc b/src/latbin/lattice-determinize-phone-pruned-non-compact.cc new file mode 100644 index 00000000000..8b528348914 --- /dev/null +++ b/src/latbin/lattice-determinize-phone-pruned-non-compact.cc @@ -0,0 +1,139 @@ +// latbin/lattice-determinize-phoned-pruned-non-compact.cc + +// Copyright 2014 Guoguo Chen +// 2017 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#include "base/kaldi-common.h" +#include "hmm/transition-model.h" +#include "lat/kaldi-lattice.h" +#include "lat/determinize-lattice-pruned.h" +#include "lat/lattice-functions.h" +#include "lat/push-lattice.h" +#include "util/common-utils.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + typedef kaldi::int32 int32; + + const char *usage = + "Determinize lattices, keeping only the best path (sequence of\n" + "acoustic states) for each input-symbol sequence. 
This version does\n"
+        "phone insertion when doing a first pass determinization, it then\n"
+        "removes the inserted symbols and does a second pass determinization.\n"
+        "It also does pruning as part of the determinization algorithm, which\n"
+        "is more efficient and prevents blowup.\n"
+        "This version retains the acoustic scores on the arcs and writes the "
+        "output as a regular lattice.\n"
+        "\n"
+        "Usage: lattice-determinize-phone-pruned-non-compact [options] \\\n"
+        "           <model> <lattice-rspecifier> <lattice-wspecifier>\n"
+        " e.g.: lattice-determinize-phone-pruned-non-compact --acoustic-scale=0.1 \\\n"
+        "           final.mdl ark:in.lats ark:det.lats\n";
+
+    ParseOptions po(usage);
+    BaseFloat acoustic_scale = 1.0;
+    BaseFloat beam = 10.0;
+    fst::DeterminizeLatticePhonePrunedOptions opts;
+
+    po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic"
+                " likelihoods.");
+    po.Register("beam", &beam, "Pruning beam [applied after acoustic scaling].");
+    opts.Register(&po);
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string model_rxfilename = po.GetArg(1),
+        lats_rspecifier = po.GetArg(2),
+        lats_wspecifier = po.GetArg(3);
+
+    TransitionModel trans_model;
+    ReadKaldiObject(model_rxfilename, &trans_model);
+
+    SequentialLatticeReader lat_reader(lats_rspecifier);
+
+    LatticeWriter lat_writer(lats_wspecifier);
+
+    int32 n_done = 0, n_warn = 0;
+
+    // depth stats (for diagnostics).
+    double sum_depth_in = 0.0,
+        sum_depth_out = 0.0, sum_t = 0.0;
+
+    if (acoustic_scale == 0.0)
+      KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)";
+
+    for (; !lat_reader.Done(); lat_reader.Next()) {
+      std::string key = lat_reader.Key();
+      Lattice lat = lat_reader.Value();
+      lat_reader.FreeCurrent();
+
+      KALDI_VLOG(2) << "Processing lattice " << key;
+
+      fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &lat);
+
+      // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count)
+      unordered_map<std::pair<int32, int32>, std::pair<BaseFloat, int32>,
+          PairHasher<int32> > acoustic_scores;
+      ComputeAcousticScoresMap(lat, &acoustic_scores);
+
+      CompactLattice det_clat;
+      if (!DeterminizeLatticePhonePrunedWrapper(
+              trans_model, &lat, beam, &det_clat, opts)) {
+        KALDI_WARN << "For key " << key << ", determinization did not succeed "
+            "(partial output will be pruned tighter than the specified beam.)";
+        n_warn++;
+      }
+
+      int32 t;
+      TopSortCompactLatticeIfNeeded(&det_clat);
+      double depth = CompactLatticeDepth(det_clat, &t);
+      sum_depth_in += lat.NumStates();
+      sum_depth_out += depth * t;
+      sum_t += t;
+
+      Lattice out_lat;
+      fst::ConvertLattice(det_clat, &out_lat);
+
+      // Replace each arc (t, tid) with the averaged acoustic score from
+      // the computed map
+      ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat);
+
+      fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &out_lat);
+      lat_writer.Write(key, out_lat);
+      n_done++;
+    }
+
+    if (sum_t != 0.0) {
+      KALDI_LOG << "Average input-lattice depth (measured at state level) is "
+                << (sum_depth_in / sum_t) << ", output depth is "
+                << (sum_depth_out / sum_t) << ", over " << sum_t << " frames "
+                << " (average num-frames = " << (sum_t / n_done) << ").";
+    }
+    KALDI_LOG << "Done " << n_done << " lattices, determinization finished "
+              << "earlier than specified by the beam (or output was empty) on "
+              << n_warn << " of these.";
+    return (n_done != 0 ?
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/latbin/lattice-determinize-pruned-non-compact.cc b/src/latbin/lattice-determinize-pruned-non-compact.cc index bd70032a3c0..edf0fac213c 100644 --- a/src/latbin/lattice-determinize-pruned-non-compact.cc +++ b/src/latbin/lattice-determinize-pruned-non-compact.cc @@ -25,108 +25,6 @@ #include "lat/push-lattice.h" #include "lat/minimize-lattice.h" -namespace kaldi { - -typedef Lattice::StateId StateId; -typedef Lattice::Arc Arc; - -void ComputeAcousticScoresMap( - const Lattice &lat, - unordered_map, std::pair, - PairHasher > *acoustic_scores) { - acoustic_scores->clear(); - - std::vector state_times; - LatticeStateTimes(lat, &state_times); - - KALDI_ASSERT(lat.Start() == 0); - - for (StateId s = 0; s < lat.NumStates(); s++) { - int32 t = state_times[s]; - for (fst::ArcIterator aiter(lat, s); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - const LatticeWeight &weight = arc.weight; - - int32 tid = arc.ilabel; - - if (tid != 0) { - unordered_map, std::pair, - PairHasher >::iterator it = acoustic_scores->find(std::make_pair(t, tid)); - if (it == acoustic_scores->end()) { - acoustic_scores->insert(std::make_pair(std::make_pair(t, tid), - std::make_pair(weight.Value2(), 1))); - } else { - if (it->second.second == 2 - && it->second.first / it->second.second != weight.Value2()) { - KALDI_VLOG(2) << "Transitions on the same frame have different " - << "acoustic costs for tid " << tid << "; " - << it->second.first / it->second.second - << " vs " << weight.Value2(); - } - it->second.first += weight.Value2(); - it->second.second++; - } - } else { - // Arcs with epsilon input label (tid) must have 0 acoustic cost - KALDI_ASSERT(weight.Value2() == 0); - } - } - - LatticeWeight f = lat.Final(s); - if (f != LatticeWeight::Zero()) { - // Final acoustic cost must be 0 as we are reading from - // non-determinized, non-compact lattice - KALDI_ASSERT(f.Value2() == 0.0); - } - } -} - -void ReplaceAcousticScoresFromMap( - const unordered_map, std::pair, - PairHasher > &acoustic_scores, - Lattice *lat) { - fst::TopSort(lat); - - std::vector state_times; - LatticeStateTimes(*lat, &state_times); - - KALDI_ASSERT(lat->Start() == 0); - - for (StateId s = 0; s < lat->NumStates(); s++) { - int32 t = state_times[s]; - for (fst::MutableArcIterator aiter(lat, s); - !aiter.Done(); aiter.Next()) { - Arc arc(aiter.Value()); - - int32 tid = arc.ilabel; - if (tid != 0) { - unordered_map, std::pair, - PairHasher >::const_iterator it = acoustic_scores.find(std::make_pair(t, tid)); - if (it == acoustic_scores.end()) { - KALDI_ERR << "Could not find tid " << tid << " at time " << t - << " in the acoustic scores map."; - } else { - arc.weight.SetValue2(it->second.first / it->second.second); - } - } else { - // For epsilon arcs, set acoustic cost to 0.0 - arc.weight.SetValue2(0.0); - } - aiter.SetValue(arc); - } - - LatticeWeight f = lat->Final(s); - if (f != LatticeWeight::Zero()) { - // Set final acoustic cost to 0.0 - f.SetValue2(0.0); - lat->SetFinal(s, f); - } - } -} - -} // end namespace kaldi - int main(int argc, char *argv[]) { try { using namespace kaldi; diff --git a/src/latbin/lattice-lmrescore-const-arpa.cc b/src/latbin/lattice-lmrescore-const-arpa.cc index 789f0fb8d4e..bd5f9c16cf7 100644 --- a/src/latbin/lattice-lmrescore-const-arpa.cc +++ b/src/latbin/lattice-lmrescore-const-arpa.cc @@ -44,8 +44,10 @@ int main(int argc, char *argv[]) { " const_arpa ark:out.lats\n"; ParseOptions po(usage); + bool 
write_compact = true; BaseFloat lm_scale = 1.0; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("lm-scale", &lm_scale, "Scaling factor for language model " "costs; frequently 1.0 or -1.0"); @@ -65,14 +67,45 @@ int main(int argc, char *argv[]) { ReadKaldiObject(lm_rxfilename, &const_arpa); // Reads and writes as compact lattice. - SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); - CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + SequentialCompactLatticeReader compact_lattice_reader; + CompactLatticeWriter compact_lattice_writer; + + SequentialLatticeReader lattice_reader; + LatticeWriter lattice_writer; + + if (write_compact) { + compact_lattice_reader.Open(lats_rspecifier); + compact_lattice_writer.Open(lats_wspecifier); + } else { + lattice_reader.Open(lats_rspecifier); + lattice_writer.Open(lats_wspecifier); + } int32 n_done = 0, n_fail = 0; - for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { - std::string key = compact_lattice_reader.Key(); - CompactLattice clat = compact_lattice_reader.Value(); - compact_lattice_reader.FreeCurrent(); + for (; write_compact ? !compact_lattice_reader.Done() : !lattice_reader.Done(); + write_compact ? compact_lattice_reader.Next() : lattice_reader.Next()) { + std::string key = write_compact ? compact_lattice_reader.Key() : lattice_reader.Key(); + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + + CompactLattice clat; + if (write_compact) { + clat = compact_lattice_reader.Value(); + compact_lattice_reader.FreeCurrent(); + } else { + const Lattice &lat = lattice_reader.Value(); + + if (lm_scale == 0.0) { + lattice_writer.Write(key, lat); + continue; + } + + ComputeAcousticScoresMap(lat, &acoustic_scores); + fst::ConvertLattice(lat, &clat); + lattice_reader.FreeCurrent(); + } if (lm_scale != 0.0) { // Before composing with the LM FST, we scale the lattice weights @@ -104,12 +137,23 @@ int main(int argc, char *argv[]) { << " (incompatible LM?)"; n_fail++; } else { - compact_lattice_writer.Write(key, determinized_clat); + if (write_compact) { + compact_lattice_writer.Write(key, determinized_clat); + } else { + Lattice out_lat; + fst::ConvertLattice(determinized_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); + } n_done++; } } else { // Zero scale so nothing to do. 
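+      // Note: this branch is only reached when --write-compact=true, because
+      // the non-compact case with lm_scale == 0.0 already wrote the input
+      // lattice unchanged and 'continue'd above, so writing via
+      // compact_lattice_writer here is safe.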
n_done++; + compact_lattice_writer.Write(key, clat); } } diff --git a/src/latbin/lattice-lmrescore.cc b/src/latbin/lattice-lmrescore.cc index 2e5406f75de..03395e68afb 100644 --- a/src/latbin/lattice-lmrescore.cc +++ b/src/latbin/lattice-lmrescore.cc @@ -24,6 +24,7 @@ #include "fstext/fstext-lib.h" #include "fstext/kaldi-fst-io.h" #include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" int main(int argc, char *argv[]) { try { @@ -43,9 +44,11 @@ int main(int argc, char *argv[]) { " e.g.: lattice-lmrescore --lm-scale=-1.0 ark:in.lats 'fstproject --project_output=true data/lang/G.fst|' ark:out.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat lm_scale = 1.0; int32 num_states_cache = 50000; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("lm-scale", &lm_scale, "Scaling factor for language model costs; frequently 1.0 or -1.0"); po.Register("num-states-cache", &num_states_cache, "Number of states we cache when mapping LM FST to lattice type. " @@ -100,7 +103,13 @@ int main(int argc, char *argv[]) { SequentialLatticeReader lattice_reader(lats_rspecifier); // Write as compact lattice. - CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + + if (write_compact) + compact_lattice_writer.Open(lats_wspecifier); + else + lattice_writer.Open(lats_wspecifier); int32 n_done = 0, n_fail = 0; @@ -116,6 +125,12 @@ int main(int argc, char *argv[]) { // right effect (taking the "best path" through the LM) regardless // of the sign of lm_scale. fst::ScaleLattice(fst::GraphLatticeScale(1.0 / lm_scale), &lat); + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + if (!write_compact) + ComputeAcousticScoresMap(lat, &acoustic_scores); + ArcSort(&lat, fst::OLabelCompare()); Lattice composed_lat; @@ -126,6 +141,7 @@ int main(int argc, char *argv[]) { TableCompose(lat, lm_fst, &composed_lat, &lm_compose_cache); Invert(&composed_lat); // make it so word labels are on the input. + CompactLattice determinized_lat; DeterminizeLattice(composed_lat, &determinized_lat); fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &determinized_lat); @@ -133,15 +149,30 @@ int main(int argc, char *argv[]) { KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; n_fail++; } else { - compact_lattice_writer.Write(key, determinized_lat); + if (write_compact) { + compact_lattice_writer.Write(key, determinized_lat); + } else { + Lattice out_lat; + fst::ConvertLattice(determinized_lat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); + } n_done++; } } else { // zero scale so nothing to do. 
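+      // With a zero LM scale the lattice passes through unmodified (converted
+      // to CompactLattice only if --write-compact=true), so its per-arc
+      // acoustic scores are still intact and no acoustic-score map is needed.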
n_done++; - CompactLattice compact_lat; - ConvertLattice(lat, &compact_lat); - compact_lattice_writer.Write(key, compact_lat); + + if (write_compact) { + CompactLattice compact_lat; + ConvertLattice(lat, &compact_lat); + compact_lattice_writer.Write(key, compact_lat); + } else { + lattice_writer.Write(key, lat); + } } } diff --git a/src/latbin/lattice-prune.cc b/src/latbin/lattice-prune.cc index 49399f748e4..993eea41145 100644 --- a/src/latbin/lattice-prune.cc +++ b/src/latbin/lattice-prune.cc @@ -40,10 +40,12 @@ int main(int argc, char *argv[]) { " e.g.: lattice-prune --acoustic-scale=0.1 --beam=4.0 ark:1.lats ark:pruned.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat acoustic_scale = 1.0; BaseFloat inv_acoustic_scale = 1.0; BaseFloat beam = 10.0; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); po.Register("inv-acoustic-scale", &inv_acoustic_scale, "An alternative way of setting the " "acoustic scale: you can set its inverse."); @@ -63,10 +65,18 @@ int main(int argc, char *argv[]) { std::string lats_rspecifier = po.GetArg(1), lats_wspecifier = po.GetArg(2); - + SequentialCompactLatticeReader compact_lattice_reader; + CompactLatticeWriter compact_lattice_writer; + SequentialLatticeReader lattice_reader; + LatticeWriter lattice_writer; - SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); - CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + if (write_compact) { + compact_lattice_reader.Open(lats_rspecifier); + compact_lattice_writer.Open(lats_wspecifier); + } else { + lattice_reader.Open(lats_rspecifier); + lattice_writer.Open(lats_wspecifier); + } int32 n_done = 0, n_err = 0; int64 n_arcs_in = 0, n_arcs_out = 0, @@ -75,10 +85,25 @@ int main(int argc, char *argv[]) { if (acoustic_scale == 0.0) KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)"; - for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { - std::string key = compact_lattice_reader.Key(); - CompactLattice clat = compact_lattice_reader.Value(); - compact_lattice_reader.FreeCurrent(); + for (; write_compact ? !compact_lattice_reader.Done() : !lattice_reader.Done(); + write_compact ? compact_lattice_reader.Next() : lattice_reader.Next()) { + std::string key = write_compact ? 
compact_lattice_reader.Key() : lattice_reader.Key(); + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + + CompactLattice clat; + if (write_compact) { + clat = compact_lattice_reader.Value(); + compact_lattice_reader.FreeCurrent(); + } else { + const Lattice &lat = lattice_reader.Value(); + ComputeAcousticScoresMap(lat, &acoustic_scores); + + fst::ConvertLattice(lat, &clat); + lattice_reader.FreeCurrent(); + } fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &clat); int64 narcs = NumArcs(clat), nstates = clat.NumStates(); n_arcs_in += narcs; @@ -96,7 +121,18 @@ int main(int argc, char *argv[]) { << nstates << " to " << pruned_nstates << " and #arcs from " << narcs << " to " << pruned_narcs; fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &pruned_clat); - compact_lattice_writer.Write(key, pruned_clat); + + if (write_compact) { + compact_lattice_writer.Write(key, pruned_clat); + } else { + Lattice out_lat; + fst::ConvertLattice(pruned_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); + } n_done++; } diff --git a/src/latbin/lattice-scale.cc b/src/latbin/lattice-scale.cc index 5ca6012d994..58a0d2fb372 100644 --- a/src/latbin/lattice-scale.cc +++ b/src/latbin/lattice-scale.cc @@ -39,12 +39,14 @@ int main(int argc, char *argv[]) { " e.g.: lattice-scale --lm-scale=0.0 ark:1.lats ark:scaled.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat acoustic_scale = 1.0; BaseFloat inv_acoustic_scale = 1.0; BaseFloat lm_scale = 1.0; BaseFloat acoustic2lm_scale = 0.0; BaseFloat lm2acoustic_scale = 0.0; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); po.Register("inv-acoustic-scale", &inv_acoustic_scale, "An alternative way " "of setting the acoustic scale: you can set its inverse."); @@ -61,14 +63,9 @@ int main(int argc, char *argv[]) { std::string lats_rspecifier = po.GetArg(1), lats_wspecifier = po.GetArg(2); - - SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); - - // Write as compact lattice. - CompactLatticeWriter compact_lattice_writer(lats_wspecifier); - + int32 n_done = 0; - + KALDI_ASSERT(acoustic_scale == 1.0 || inv_acoustic_scale == 1.0); if (inv_acoustic_scale != 1.0) acoustic_scale = 1.0 / inv_acoustic_scale; @@ -81,12 +78,32 @@ int main(int argc, char *argv[]) { scale[1][0] = lm2acoustic_scale; scale[1][1] = acoustic_scale; - for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { - CompactLattice lat = compact_lattice_reader.Value(); - ScaleLattice(scale, &lat); - compact_lattice_writer.Write(compact_lattice_reader.Key(), lat); - n_done++; + if (write_compact) { + SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); + + // Write as compact lattice. + CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + + for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { + CompactLattice lat = compact_lattice_reader.Value(); + ScaleLattice(scale, &lat); + compact_lattice_writer.Write(compact_lattice_reader.Key(), lat); + n_done++; + } + } else { + SequentialLatticeReader lattice_reader(lats_rspecifier); + + // Write as regular lattice. 
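+      // ScaleLattice operates directly on the per-arc (graph, acoustic) weight
+      // pairs of the Lattice, so no acoustic-score map is needed in this mode.
+      // Hypothetical example of the new non-compact usage:
+      //   lattice-scale --write-compact=false --lm-scale=0.0 ark:1.lats ark:scaled.lats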
+ LatticeWriter lattice_writer(lats_wspecifier); + + for (; !lattice_reader.Done(); lattice_reader.Next()) { + Lattice lat = lattice_reader.Value(); + ScaleLattice(scale, &lat); + lattice_writer.Write(lattice_reader.Key(), lat); + n_done++; + } } + KALDI_LOG << "Done " << n_done << " lattices."; return (n_done != 0 ? 0 : 1); } catch(const std::exception &e) { diff --git a/src/latbin/lattice-to-fst.cc b/src/latbin/lattice-to-fst.cc index 0d2ac29a99b..16ef0f60ce6 100644 --- a/src/latbin/lattice-to-fst.cc +++ b/src/latbin/lattice-to-fst.cc @@ -22,6 +22,50 @@ #include "util/common-utils.h" #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" +#include "hmm/transition-model.h" + +namespace kaldi { + +void ConvertLatticeToPdfLabels( + const TransitionModel &tmodel, + const Lattice &ifst, + fst::StdVectorFst *ofst) { + typedef fst::ArcTpl ArcIn; + typedef fst::StdArc ArcOut; + typedef ArcIn::StateId StateId; + ofst->DeleteStates(); + // The states will be numbered exactly the same as the original FST. + // Add the states to the new FST. + StateId num_states = ifst.NumStates(); + for (StateId s = 0; s < num_states; s++) { + StateId news = ofst->AddState(); + assert(news == s); + } + ofst->SetStart(ifst.Start()); + for (StateId s = 0; s < num_states; s++) { + LatticeWeight final_iweight = ifst.Final(s); + if (final_iweight != LatticeWeight::Zero()) { + fst::TropicalWeight final_oweight; + ConvertLatticeWeight(final_iweight, &final_oweight); + ofst->SetFinal(s, final_oweight); + } + for (fst::ArcIterator iter(ifst, s); + !iter.Done(); + iter.Next()) { + ArcIn arc = iter.Value(); + KALDI_PARANOID_ASSERT(arc.weight != LatticeWeight::Zero()); + ArcOut oarc; + ConvertLatticeWeight(arc.weight, &oarc.weight); + oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; + oarc.olabel = arc.olabel; + oarc.nextstate = arc.nextstate; + ofst->AddArc(s, oarc); + } + } +} + +} + int main(int argc, char *argv[]) { try { @@ -34,8 +78,9 @@ int main(int argc, char *argv[]) { using std::vector; BaseFloat acoustic_scale = 0.0; BaseFloat lm_scale = 0.0; - bool rm_eps = true; - + bool rm_eps = true, read_compact = true, convert_to_pdf_labels = false; + std::string trans_model; + const char *usage = "Turn lattices into normal FSTs, retaining only the word labels\n" "By default, removes all weights and also epsilons (configure with\n" @@ -44,9 +89,14 @@ int main(int argc, char *argv[]) { " e.g.: lattice-to-fst ark:1.lats ark:1.fsts\n"; ParseOptions po(usage); + po.Register("read-compact", &read_compact, "Read compact lattice"); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); po.Register("lm-scale", &lm_scale, "Scaling factor for graph/lm costs"); po.Register("rm-eps", &rm_eps, "Remove epsilons in resulting FSTs (in lazy way; may not remove all)"); + po.Register("convert-to-pdf-labels", &convert_to_pdf_labels, + "Convert lattice to pdf labels"); + po.Register("trans-model", &trans_model, + "Transition model"); po.Read(argc, argv); @@ -60,31 +110,67 @@ int main(int argc, char *argv[]) { std::string lats_rspecifier = po.GetArg(1), fsts_wspecifier = po.GetArg(2); - SequentialCompactLatticeReader lattice_reader(lats_rspecifier); + TransitionModel tmodel; + if (!trans_model.empty()) { + ReadKaldiObject(trans_model, &tmodel); + } + + SequentialCompactLatticeReader compact_lattice_reader; + SequentialLatticeReader lattice_reader; + TableWriter fst_writer(fsts_wspecifier); int32 n_done = 0; // there is no failure mode, barring a crash. 
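+    // Hypothetical example combining the options added above ('final.mdl'
+    // stands in for whatever transition model matches the lattices):
+    //   lattice-to-fst --read-compact=false --convert-to-pdf-labels=true \
+    //     --trans-model=final.mdl ark:1.lats ark:1.fsts
+    // With --convert-to-pdf-labels, ConvertLatticeToPdfLabels() above writes
+    // input labels as pdf-id + 1, presumably so that label 0 stays reserved
+    // for epsilon.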
- for (; !lattice_reader.Done(); lattice_reader.Next()) { - std::string key = lattice_reader.Key(); - CompactLattice clat = lattice_reader.Value(); - lattice_reader.FreeCurrent(); - ScaleLattice(scale, &clat); // typically scales to zero. - RemoveAlignmentsFromCompactLattice(&clat); // remove the alignments... - fst::VectorFst fst; - { - Lattice lat; - ConvertLattice(clat, &lat); // convert to non-compact form.. won't introduce - // extra states because already removed alignments. - ConvertLattice(lat, &fst); // this adds up the (lm,acoustic) costs to get - // the normal (tropical) costs. - Project(&fst, fst::PROJECT_OUTPUT); // Because in the standard Lattice format, - // the words are on the output, and we want the word labels. + if (read_compact) { + SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); + for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { + std::string key = compact_lattice_reader.Key(); + CompactLattice clat = compact_lattice_reader.Value(); + compact_lattice_reader.FreeCurrent(); + ScaleLattice(scale, &clat); // typically scales to zero. + RemoveAlignmentsFromCompactLattice(&clat); // remove the alignments... + fst::VectorFst fst; + { + Lattice lat; + ConvertLattice(clat, &lat); // convert to non-compact form.. won't introduce + // extra states because already removed alignments. + + if (convert_to_pdf_labels) { + ConvertLatticeToPdfLabels(tmodel, lat, &fst); // this adds up the (lm,acoustic) costs to get + // the normal (tropical) costs. + } else { + ConvertLattice(lat, &fst); + } + + Project(&fst, fst::PROJECT_OUTPUT); // Because in the standard compact_lattice format, + // the words are on the output, and we want the word labels. + } + if (rm_eps) RemoveEpsLocal(&fst); + + fst_writer.Write(key, fst); + n_done++; } - if (rm_eps) RemoveEpsLocal(&fst); - - fst_writer.Write(key, fst); - n_done++; + } else { + SequentialLatticeReader lattice_reader(lats_rspecifier); + for (; !lattice_reader.Done(); lattice_reader.Next()) { + std::string key = lattice_reader.Key(); + Lattice lat = lattice_reader.Value(); + lattice_reader.FreeCurrent(); + ScaleLattice(scale, &lat); // typically scales to zero. + fst::VectorFst fst; + if (convert_to_pdf_labels) { + ConvertLatticeToPdfLabels(tmodel, lat, &fst); + } else { + ConvertLattice(lat, &fst); + } + Project(&fst, fst::PROJECT_INPUT); + if (rm_eps) RemoveEpsLocal(&fst); + + fst_writer.Write(key, fst); + n_done++; + } + } KALDI_LOG << "Done converting " << n_done << " lattices to word-level FSTs"; return (n_done != 0 ? 0 : 1); diff --git a/src/latbin/lattice-to-phone-lattice.cc b/src/latbin/lattice-to-phone-lattice.cc index 10da2b47bf1..749435d3bf6 100644 --- a/src/latbin/lattice-to-phone-lattice.cc +++ b/src/latbin/lattice-to-phone-lattice.cc @@ -49,6 +49,8 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); bool replace_words = true; + bool write_compact = true; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("replace-words", &replace_words, "If true, replace words with phones; otherwise replace " "transition-ids with phones."); @@ -70,26 +72,37 @@ int main(int argc, char *argv[]) { ReadKaldiObject(model_rxfilename, &trans_model); - SequentialCompactLatticeReader clat_reader(lats_rspecifier); - CompactLatticeWriter clat_writer(lats_wspecifier); // write as compact. 
-    for (; !clat_reader.Done(); clat_reader.Next()) {
-      if (replace_words) {
-        Lattice lat;
-        ConvertLattice(clat_reader.Value(), &lat);
-        ConvertLatticeToPhones(trans_model, &lat); // this function replaces words -> phones
-        CompactLattice clat;
-        ConvertLattice(lat, &clat);
-        clat_writer.Write(clat_reader.Key(), clat);
-      } else { // replace transition-ids with phones.
-        CompactLattice clat(clat_reader.Value());
-        ConvertCompactLatticeToPhones(trans_model, &clat);
-        // this function replaces transition-ids with phones. We do it in the
-        // CompactLattice form, in order to preserve the alignment of
-        // transition-id sequences/phones-sequences to words [e.g. if you just
-        // did lattice-align-words].
-        clat_writer.Write(clat_reader.Key(), clat);
+    if (write_compact) {
+      SequentialCompactLatticeReader clat_reader(lats_rspecifier);
+      CompactLatticeWriter clat_writer(lats_wspecifier);
+      for (; !clat_reader.Done(); clat_reader.Next()) {
+        if (replace_words) {
+          Lattice lat;
+          ConvertLattice(clat_reader.Value(), &lat);
+          ConvertLatticeToPhones(trans_model, &lat); // this function replaces words -> phones
+          CompactLattice clat;
+          ConvertLattice(lat, &clat);
+          clat_writer.Write(clat_reader.Key(), clat);
+        } else { // replace transition-ids with phones.
+          CompactLattice clat(clat_reader.Value());
+          ConvertCompactLatticeToPhones(trans_model, &clat);
+          // this function replaces transition-ids with phones. We do it in the
+          // CompactLattice form, in order to preserve the alignment of
+          // transition-id sequences/phones-sequences to words [e.g. if you just
+          // did lattice-align-words].
+          clat_writer.Write(clat_reader.Key(), clat);
+        }
+        n_done++;
+      }
+    } else {
+      SequentialLatticeReader lat_reader(lats_rspecifier);
+      LatticeWriter lat_writer(lats_wspecifier);
+      for (; !lat_reader.Done(); lat_reader.Next()) {
+        Lattice lat(lat_reader.Value());
+        ConvertLatticeToPhones(trans_model, &lat, replace_words); // this function replaces words -> phones
+        lat_writer.Write(lat_reader.Key(), lat);
+        n_done++;
+      }
-      n_done++;
     }
     KALDI_LOG << "Done converting " << n_done << " lattices.";
     return (n_done != 0 ?
0 : 1); From 8772dbaf3493869e25fe26e4e5516f6ad77ff1be Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 3 Oct 2017 18:18:34 -0400 Subject: [PATCH 071/174] semisup: Adding tolerances to lattices --- .../run_tdnn_15k_semisupervised_conf_o.sh | 445 ++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_p.sh | 454 +++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_q.sh | 473 ++++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_r.sh | 437 ++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_s.sh | 436 ++++++++++++++++ .../run_tdnn_50k_semisupervised_conf_e.sh | 452 +++++++++++++++++ .../run_tdnn_50k_semisupervised_conf_f.sh | 444 ++++++++++++++++ .../run_tdnn_50k_semisupervised_conf_g.sh | 439 ++++++++++++++++ .../s5/local/semisup/run_15k.sh | 2 +- .../nnet3/train/chain_objf/acoustic_model.py | 34 +- egs/wsj/s5/steps/libs/nnet3/train/common.py | 5 + egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 28 +- egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh | 4 +- egs/wsj/s5/steps/nnet3/chain/train.py | 28 +- egs/wsj/s5/steps/nnet3/decode.sh | 9 +- .../steps/nnet3/multilingual/combine_egs.sh | 4 +- .../data/perturb_speed_to_allowed_lengths.py | 243 +++++++++ src/chain/chain-supervision-splitter.cc | 341 +++++++++++-- src/chain/chain-supervision-splitter.h | 137 ++++- src/chainbin/Makefile | 3 +- src/chainbin/chain-split-lattices.cc | 51 +- src/chainbin/nnet3-chain-get-egs.cc | 1 + src/lat/lattice-functions.cc | 14 +- src/latbin/lattice-to-fst.cc | 12 +- src/nnet3/nnet-chain-combine.cc | 1 + src/nnet3/nnet-chain-diagnostics.cc | 37 ++ src/nnet3/nnet-chain-diagnostics.h | 2 + src/nnet3/nnet-chain-training.cc | 37 ++ src/nnet3/nnet-chain-training.h | 2 + src/nnet3/nnet-combine.h | 5 + src/nnet3/nnet-diagnostics.h | 5 + src/nnet3/nnet-example-utils.cc | 37 ++ src/nnet3/nnet-example-utils.h | 6 +- src/nnet3/nnet-training.h | 5 + src/online2bin/extend-wav-with-silence.cc | 60 ++- 35 files changed, 4554 insertions(+), 139 deletions(-) create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_o.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_p.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_q.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_r.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_e.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_g.sh create mode 100755 egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_o.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_o.sh new file mode 100644 index 00000000000..e3d1aae047b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_o.sh @@ -0,0 +1,445 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
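+# Hypothetical invocation (run from egs/fisher_english/s5; adjust stages and
+# paths to your setup):
+#   local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_o.sh \
+#     --stage -2 --train-stage -100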
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +lm_opts= + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1e # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" --lm-opts "$lm_opts" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
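+    # The unsupervised examples generated below carry lattice supervision
+    # rather than a single best path: --lattice-lm-scale keeps (scaled) LM
+    # costs from the decoded lattices, --lattice-prune-beam prunes the
+    # lattices first, --left/right-tolerance control the frame tolerance of
+    # the supervision, and --deriv-weights-scp weights each frame's derivative
+    # by the best-path posteriors computed earlier.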
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_p.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_p.sh new file mode 100644 index 00000000000..19d17ef8418 
--- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_p.sh @@ -0,0 +1,454 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +lm_opts= +unsup_egs_opts= + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1p # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" --lm-opts "$lm_opts" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
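+    # Unlike the conf_o script above, this variant extracts the unsupervised
+    # egs with get_egs_split_and_convert.sh below, passing
+    # --splitter-opts "--add-partial-unk-label-left --add-partial-unk-label-right",
+    # and uses lattice_lm_scale=0.5, lattice_prune_beam=4.0 and tolerance=1
+    # (set near the top of this script).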
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split_and_convert.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + --splitter-opts "--add-partial-unk-label-left --add-partial-unk-label-right" \ + $unsup_egs_opts \ + data/${unsupervised_set}_hires data/lang_chain $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_q.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_q.sh new 
file mode 100644 index 00000000000..3c452cbde83 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_q.sh @@ -0,0 +1,473 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +lm_opts= + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1q # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128,64/300=100,64,32/600=50,32,16/1200=16,8" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" --lm-opts "$lm_opts" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp_sil +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
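+    # Added note: the two lattice options passed below control how the decoded
+    # lattices become numerator supervision for the unsupervised data.  A rough,
+    # illustrative sketch (not the actual internals of get_egs.sh) with standard
+    # Kaldi binaries would be:
+    #   lattice-prune --beam=$lattice_prune_beam "ark:gunzip -c lat.1.gz |" ark:- | \
+    #     lattice-scale --lm-scale=$lattice_lm_scale ark:- ark:pruned.lats
+    # i.e. prune each lattice to the given beam, then keep its graph (LM) scores
+    # scaled by lattice_lm_scale (0.0 in this variant, so those scores are
+    # effectively discarded).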
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --no-chunking true \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_r.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_r.sh new file mode 100644 index 
00000000000..e7179b3bc76 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_r.sh @@ -0,0 +1,437 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1r # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
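+    # Added note: the supervised lattices in $sup_lat_dir come from a GMM system
+    # at the original 10 ms frame rate, so --alignment-subsampling-factor 3 below
+    # converts them to the chain model's subsampled rate; the unsupervised
+    # lattices (stage 13) were decoded with the chain seed model and therefore
+    # use a factor of 1.  The frame_subsampling_factor/2 added to the contexts
+    # above works out as, e.g., int(16 + 3/2) = 17 frames of left context when
+    # model_left_context=16 and extra_left_context=0 (16 is only an illustrative
+    # value; the real one comes from configs/vars).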
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --trainer.objective-scales="output-1-xent:0.5" \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh new file mode 100644 index 00000000000..ab78062f89d --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh @@ -0,0 +1,436 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1s # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
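+    # Added note: create_split_dir.pl above spreads the (large) egs archives over
+    # several filesystems by creating numbered symlinks under
+    # $sup_egs_dir/storage that point at the listed disks, and the .nodelete file
+    # keeps cleanup from removing them.  Off the CLSP grid the same can be done
+    # with your own disks, e.g. (hypothetical paths):
+    #   utils/create_split_dir.pl /data/disk{1,2}/$USER/egs/storage \
+    #     $sup_egs_dir/storage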
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_e.sh new file mode 100755 index 00000000000..8750c88e627 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_e.sh @@ -0,0 +1,452 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +lm_opts= +do_finetuning=false + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1e # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + --lm-opts "--num-extra-lm-states=2000" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
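+    # Added note: --deriv-weights-scp below attaches a per-frame weight (the
+    # lattice posterior of the best-path pdf, computed when the best_path
+    # directory was made) to each unsupervised example, so frames where the seed
+    # model is unsure contribute less to the derivative.  To eyeball the weights
+    # for one utterance you could run, e.g.:
+    #   copy-vector \
+    #     "scp:head -n 1 $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp |" \
+    #     ark,t:-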
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh new 
file mode 100755 index 00000000000..3af0410dbe1 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh @@ -0,0 +1,444 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +lm_opts= +do_finetuning=false + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= +unsup_egs_opts= + +# Semi-supervised options +comb_affix=comb1f # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +apply_deriv_weights=true +tree_affix= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
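The left/right context values passed to get_egs.sh in this stage are the model's own contexts (read from $dir/configs/vars), padded by the optional extra_left_context/extra_right_context and then by half the frame-subsampling factor via inline perl with int() truncation. A minimal sketch of the same arithmetic, assuming illustrative model context values (28/14 are not taken from this recipe):

    # Sketch of the context padding done with inline perl above.
    # model_left_context / model_right_context below are illustrative
    # assumptions; the recipe reads them from $dir/configs/vars.
    model_left_context = 28
    model_right_context = 14
    extra_left_context = 0
    extra_right_context = 0
    frame_subsampling_factor = 3

    left_context = model_left_context + extra_left_context
    right_context = model_right_context + extra_right_context

    # perl's int() truncates toward zero, so adding fs/2 = 1.5 pads by one frame.
    left_context = int(left_context + frame_subsampling_factor / 2.0)
    right_context = int(right_context + frame_subsampling_factor / 2.0)

    print(left_context, right_context)  # 29 15 with the values assumed above

The same half-subsampling-factor padding is applied to left_context_initial and right_context_final before they are handed to get_egs.sh.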
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
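For reference, with the defaults at the top of this script left untouched (graph_affix=_ex250k, lattice_prune_beam=4.0, lattice_lm_scale=0.5, tolerance=1, comb_affix=comb1f, unsupervised_set=train_unsup250k with _sp appended), the affix composition above expands to the following directory names under $dir; this is plain string assembly mirroring the shell variables, no Kaldi calls:

    # How the unsupervised and combined egs directory names are assembled
    # from this script's defaults (values copied from its config section).
    decode_affix = "" + "_ex250k"              # ${decode_affix}${graph_affix}
    egs_affix = "" + "_prun4.0_lmwt0.5_tol1"   # _prun${beam}_lmwt${scale}_tol${tol}
    comb_affix = "comb1f"

    unsup_egs_dir = "egs_train_unsup250k_sp" + decode_affix + egs_affix
    comb_egs_dir = comb_affix + "_egs" + decode_affix + egs_affix + "_multi"

    print(unsup_egs_dir)  # egs_train_unsup250k_sp_ex250k_prun4.0_lmwt0.5_tol1
    print(comb_egs_dir)   # comb1f_egs_ex250k_prun4.0_lmwt0.5_tol1_multi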
+ fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_g.sh new file mode 100755 index 00000000000..3679842c877 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_g.sh @@ -0,0 +1,439 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +lm_opts= +do_finetuning=false + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= +unsup_egs_opts= + +# Semi-supervised options +comb_affix=comb1g # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
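The combined-egs stage relies on combine_egs.sh, which is patched later in this series to keep only the primary (first) chunk length of each source's info/frames_per_eg via `cut -d, -f 1` and to join the per-source values into a comma-separated list. A small sketch of that selection; the multi-length unsupervised value below is illustrative, not taken from this run:

    # Mirror of the `cut -d, -f 1` selection added to combine_egs.sh later in
    # this patch: only the primary frames-per-eg of each egs dir is kept.
    sup_frames_per_eg = "150"           # supervised egs: single chunk length
    unsup_frames_per_eg = "150,110,80"  # illustrative multi-length unsup egs

    def primary(frames_per_eg):
        return frames_per_eg.split(",")[0]

    frames_per_eg_list = ",".join(primary(f) for f in
                                  [sup_frames_per_eg, unsup_frames_per_eg])
    print(frames_per_eg_list)  # 150,150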
+fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/run_15k.sh b/egs/fisher_english/s5/local/semisup/run_15k.sh index 7d5a2589a21..41590dd9fe2 100644 --- a/egs/fisher_english/s5/local/semisup/run_15k.sh +++ b/egs/fisher_english/s5/local/semisup/run_15k.sh @@ -65,7 +65,7 @@ local/semisup/chain/tuning/run_tdnn_11k.sh \ --ivector-train-set semisup15k_250k || exit 1 } -local/semisup/chain/tuning/run_tdnn_oracle.sh \ +false && local/semisup/chain/tuning/run_tdnn_oracle.sh \ --train-set semisup15k_250k \ --nnet3-affix _semi15k_250k \ --chain-affix _semi15k_250k_oracle \ diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 5f2da55d110..b415a44ea16 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -130,7 +130,7 @@ def train_new_models(dir, iter, srand, num_jobs, shuffle_buffer_size, num_chunk_per_minibatch_str, frame_subsampling_factor, truncate_deriv_weights, run_opts, backstitch_training_scale=0.0, backstitch_training_interval=1, - use_multitask_egs=False, smbr_opt=""): + use_multitask_egs=False, 
objective_opts=""): """ Called from train_one_iteration(), this method trains new models with 'num_jobs' jobs, and @@ -189,7 +189,7 @@ def train_new_models(dir, iter, srand, num_jobs, thread = common_lib.background_command( """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ nnet3-chain-train {parallel_train_opts} {verbose_opt} \ - --apply-deriv-weights={app_deriv_wts} {smbr_opt} \ + --apply-deriv-weights={app_deriv_wts} {objective_opts} \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ {cache_io_opts} --xent-regularize={xent_reg} \ {deriv_time_opts} \ @@ -228,7 +228,7 @@ def train_new_models(dir, iter, srand, num_jobs, num_chunk_per_mb=num_chunk_per_minibatch_str, multitask_egs_opts=multitask_egs_opts, scp_or_ark=scp_or_ark, - smbr_opt=smbr_opt), + objective_opts=objective_opts), require_zero_status=True) threads.append(thread) @@ -250,7 +250,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, run_opts, dropout_edit_string="", backstitch_training_scale=0.0, backstitch_training_interval=1, use_multitask_egs=False, - smbr_opt=""): + objective_opts=""): """ Called from steps/nnet3/chain/train.py for one iteration for neural network training with LF-MMI objective @@ -282,7 +282,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, use_multitask_egs=use_multitask_egs, - smbr_opt=smbr_opt) + objective_opts=objective_opts) if iter > 0: # Runs in the background @@ -313,8 +313,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, if shrinkage_value != 1.0: shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value) - objf_info = "" if smbr_opt == "" else ( - "and objective is sMBR and smbr_opt=" + smbr_opt) + objf_info = "" if objective_opts == "" else ( + "and objective_opts=" + objective_opts) logger.info("On iteration {0}, learning rate is {1}" "{shrink_info} {objf_info}.".format( iter, learning_rate, @@ -344,7 +344,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, iter / 15 if iter < 15 else backstitch_training_scale), backstitch_training_interval=backstitch_training_interval, use_multitask_egs=use_multitask_egs, - smbr_opt=smbr_opt) + objective_opts=objective_opts) [models_to_average, best_model] = common_train_lib.get_successful_models( num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) @@ -490,7 +490,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, use_multitask_egs=False, - smbr_opt=""): + objective_opts=""): model = '{0}/{1}.mdl'.format(dir, iter) scp_or_ark = "scp" if use_multitask_egs else "ark" egs_suffix = ".scp" if use_multitask_egs else ".cegs" @@ -503,7 +503,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, common_lib.background_command( """{command} {dir}/log/compute_prob_valid.{iter}.log \ - nnet3-chain-compute-prob --l2-regularize={l2} {smbr_opt} \ + nnet3-chain-compute-prob --l2-regularize={l2} {objective_opts} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/valid_diagnostic{egs_suffix} \ @@ -514,7 +514,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, egs_dir=egs_dir, multitask_egs_opts=multitask_egs_opts, scp_or_ark=scp_or_ark, egs_suffix=egs_suffix, - smbr_opt=smbr_opt)) + objective_opts=objective_opts)) multitask_egs_opts = 
common_train_lib.get_multitask_egs_opts( egs_dir, @@ -523,7 +523,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, common_lib.background_command( """{command} {dir}/log/compute_prob_train.{iter}.log \ - nnet3-chain-compute-prob --l2-regularize={l2} {smbr_opt} \ + nnet3-chain-compute-prob --l2-regularize={l2} {objective_opts} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/train_diagnostic{egs_suffix} \ @@ -534,7 +534,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, egs_dir=egs_dir, multitask_egs_opts=multitask_egs_opts, scp_or_ark=scp_or_ark, egs_suffix=egs_suffix, - smbr_opt=smbr_opt)) + objective_opts=objective_opts)) def compute_progress(dir, iter, run_opts): @@ -560,7 +560,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st xent_regularize, run_opts, sum_to_one_penalty=0.0, use_multitask_egs=False, - smbr_opt=""): + objective_opts=""): """ Function to do model combination In the nnet3 setup, the logic @@ -604,7 +604,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st common_lib.execute_command( """{command} {combine_queue_opt} {dir}/log/combine.log \ - nnet3-chain-combine --num-iters={opt_iters} {smbr_opt} \ + nnet3-chain-combine --num-iters={opt_iters} {objective_opts} \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --separate-weights-per-component={separate_weights} \ --enforce-sum-to-one={hard_enforce} \ @@ -629,7 +629,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st egs_dir=egs_dir, multitask_egs_opts=multitask_egs_opts, scp_or_ark=scp_or_ark, egs_suffix=egs_suffix, - smbr_opt=smbr_opt)) + objective_opts=objective_opts)) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the @@ -640,4 +640,4 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, use_multitask_egs=use_multitask_egs, - smbr_opt=smbr_opt) + objective_opts=objective_opts) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index bb5104abc90..667b0d5e1ca 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -917,6 +917,11 @@ def __init__(self, action=common_lib.StrToBoolAction, help="Compute train and validation " "accuracy per-dim") + self.parser.add_argument("--trainer.objective-scales", + dest='objective_scales', + type=str, + action=common_lib.NullstrToNoneAction, + help="Objective scales for different outputs") # General options self.parser.add_argument("--stage", type=int, default=-4, diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index ad726686e09..749059f3475 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -77,6 +77,7 @@ acwt=0.1 # For pruning phone_insertion_penalty= deriv_weights_scp= generate_egs_scp=false +no_chunking=false echo "$0 $@" # Print the command line for logging @@ -125,6 +126,8 @@ dir=$4 [ ! 
-z "$online_ivector_dir" ] && \ extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +$no_chunking && extra_files="$extra_files $data/allowed_lengths.txt" + for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ $chaindir/{0.trans_mdl,tree,normalization.fst} $extra_files; do [ ! -f $f ] && echo "$0: no such file $f" && exit 1; @@ -142,9 +145,16 @@ num_lat_jobs=$(cat $latdir/num_jobs) || exit 1; frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 utils/data/get_utt2dur.sh $data -cat $data/utt2dur | \ - awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ - utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; +if $no_chunking; then + frames_per_eg=$(cat $data/allowed_lengths.txt | tr '\n' , | sed 's/,$//') + + cut -d ' ' -f 1 $data/utt2spk | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; +else + cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; +fi len_uttlist=`wc -l $dir/valid_uttlist | awk '{print $1}'` if [ $len_uttlist -lt $num_utts_subset ]; then @@ -164,10 +174,17 @@ if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. rm $dir/uniq2utt $dir/valid_uttlist.tmp fi -cat $data/utt2dur | \ - awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ +if $no_chunking; then + cut -d ' ' -f 1 $data/utt2spk | \ utils/filter_scp.pl --exclude $dir/valid_uttlist | \ utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; +else + cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; +fi + len_uttlist=`wc -l $dir/train_subset_uttlist | awk '{print $1}'` if [ $len_uttlist -lt $num_utts_subset ]; then echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; @@ -288,6 +305,7 @@ fi egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" [ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" [ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" +$no_chunking && egs_opts="$egs_opts --no-chunking" [ ! -z "$deriv_weights_scp" ] && egs_opts="$egs_opts --deriv-weights-rspecifier=scp:$deriv_weights_scp" diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh index c4baf3c4ea1..ada04aef31c 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh @@ -322,6 +322,8 @@ if [ ! -z $left_tolerance_silence ] && [ ! 
-z $right_tolerance_silence ]; then chain_supervision_all_opts="$chain_supervision_all_opts --supervision.silence-phones=$(cat $lang/phones/silence_phones.csl)" fi +chain_supervision_all_opts="$chain_supervision_all_opts --acoustic-scale=$acwt" + echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context echo $left_context_initial > $dir/info/left_context_initial @@ -473,7 +475,7 @@ if [ $stage -le 5 ]; then #concatenate cegs.JOB.scp in single cegs.scp rm -rf $dir/cegs.scp for j in $(seq $num_archives_intermediate); do - for y in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do cat $dir/cegs.$j.$y.scp || exit 1; done done > $dir/cegs.scp || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index d0e6d27e984..9d94bebe4f7 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -545,14 +545,15 @@ def train(args, run_opts): xent_regularize = args.xent_regularize l2_regularize = args.l2_regularize - smbr_opt = "" + objective_opts = ("--objective-scales=" + args.objective_scales + if args.objective_scales is not None else "") smbr_factor = 0.0 if args.smbr_factor_schedule is not None: smbr_factor = common_train_lib.get_schedule_value( args.smbr_factor_schedule, float(num_archives_processed) / num_archives_to_process) - smbr_opt += " --smbr-factor={0}".format(smbr_factor) + objective_opts += " --smbr-factor={0}".format(smbr_factor) if smbr_factor > 0.0: use_smbr=True @@ -562,16 +563,16 @@ def train(args, run_opts): l2_regularize = (args.smbr_l2_regularize if args.smbr_l2_regularize is not None else args.l2_regularize) - smbr_opt += " --use-smbr-objective" + objective_opts += " --use-smbr-objective" if silence_pdfs is not None: - smbr_opt += " --silence-pdfs=" + silence_pdfs + objective_opts += " --silence-pdfs=" + silence_pdfs if args.mmi_factor_schedule is not None: mmi_factor = common_train_lib.get_schedule_value( args.mmi_factor_schedule, float(num_archives_processed) / num_archives_to_process) - smbr_opt += " --mmi-factor={0}".format(mmi_factor) + objective_opts += " --mmi-factor={0}".format(mmi_factor) percent = num_archives_processed * 100.0 / num_archives_to_process epoch = (num_archives_processed * args.num_epochs @@ -616,7 +617,7 @@ def train(args, run_opts): backstitch_training_scale=args.backstitch_training_scale, backstitch_training_interval=args.backstitch_training_interval, use_multitask_egs=use_multitask_egs, - smbr_opt=smbr_opt) + objective_opts=objective_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain @@ -642,13 +643,14 @@ def train(args, run_opts): if args.stage <= num_iters: xent_regularize = args.xent_regularize l2_regularize = args.l2_regularize - smbr_opt = "" + objective_opts = ("--objective-scales=" + args.objective_scales + if args.objective_scales is not None else "") smbr_factor = 0.0 if args.smbr_factor_schedule is not None: smbr_factor = common_train_lib.get_schedule_value( args.smbr_factor_schedule, 1.0) - smbr_opt += " --smbr-factor={0}".format(smbr_factor) + objective_opts += " --smbr-factor={0}".format(smbr_factor) if smbr_factor > 0.0: use_smbr=True @@ -658,15 +660,15 @@ def train(args, run_opts): l2_regularize = (args.smbr_l2_regularize if args.smbr_l2_regularize is not None else args.l2_regularize) - smbr_opt = "--use-smbr-objective" + objective_opts = "--use-smbr-objective" if silence_pdfs is not None: - smbr_opt += " --silence-pdfs=" + silence_pdfs + objective_opts += " 
--silence-pdfs=" + silence_pdfs if args.mmi_factor_schedule is not None: mmi_factor = common_train_lib.get_schedule_value( args.mmi_factor_schedule, 1.0) - smbr_opt += " --mmi-factor={0}".format(mmi_factor) + objective_opts += " --mmi-factor={0}".format(mmi_factor) if args.do_final_combination: logger.info("Doing final combination to produce final.mdl") @@ -682,7 +684,7 @@ def train(args, run_opts): run_opts=run_opts, sum_to_one_penalty=args.combine_sum_to_one_penalty, use_multitask_egs=use_multitask_egs, - smbr_opt=smbr_opt) + objective_opts=objective_opts) else: logger.info("Copying the last-numbered model to final.mdl") common_lib.force_symlink("{0}.mdl".format(num_iters), @@ -693,7 +695,7 @@ def train(args, run_opts): leaky_hmm_coefficient=args.leaky_hmm_coefficient, run_opts=run_opts, use_multitask_egs=use_multitask_egs, - smbr_opt=smbr_opt) + objective_opts=objective_opts) common_lib.force_symlink("compute_prob_valid.{iter}.log" "".format(iter=num_iters-1), "{dir}/log/compute_prob_valid.final.log".format( diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 275355d6695..7d4097789a7 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -32,6 +32,7 @@ extra_left_context_initial=-1 extra_right_context_final=-1 online_ivector_dir= minimize=false +determinize_opts= write_compact=true # End configuration section. @@ -119,11 +120,11 @@ if [ ! -z "$online_ivector_dir" ]; then ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" fi -opts= +extra_opts= lat_wspecifier="ark:|" if ! $write_compact; then - opts="--determinize-lattice=false" - lat_wspecifier="ark:| lattice-determinize-phone-pruned-non-compact --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize $model ark:- ark:- |" + extra_opts="--determinize-lattice=false" + lat_wspecifier="ark:| lattice-determinize-phone-pruned-non-compact --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize $determinize_opts $model ark:- ark:- |" fi if [ "$post_decode_acwt" == 1.0 ]; then @@ -148,7 +149,7 @@ if [ $stage -le 1 ]; then --extra-right-context-final=$extra_right_context_final \ --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ - --word-symbol-table=$graphdir/words.txt ${opts} \ + --word-symbol-table=$graphdir/words.txt ${extra_opts} \ "$model" \ $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1; fi diff --git a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh index dd8d9714905..2fc6357aa46 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh +++ b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh @@ -72,6 +72,7 @@ for param in $check_params; do cat ${args[0]}/$param > $megs_dir/$param || exit 1; done cat ${args[0]}/cmvn_opts > $megs_dir/cmvn_opts || exit 1; # caution: the top-level nnet training +cp ${args[0]}/info/frames_per_eg $megs_dir/info/frames_per_eg || exit 1; for lang in $(seq 0 $[$num_langs-1]);do multi_egs_dir[$lang]=${args[$lang]} @@ -85,11 +86,10 @@ for lang in $(seq 0 $[$num_langs-1]);do valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.scp" combine_scp_list="$combine_scp_list ${args[$lang]}/combine.scp" - this_frames_per_eg=$(cat ${args[$lang]}/info/frames_per_eg) + this_frames_per_eg=$(cat ${args[$lang]}/info/frames_per_eg | cut -d, -f 1) # use only the primary 
frames-per-eg if [ $lang -eq 0 ]; then frames_per_eg_list="$this_frames_per_eg" - echo $this_frames_per_eg > $megs_dir/info/frames_per_eg else frames_per_eg_list="$frames_per_eg_list,$this_frames_per_eg" fi diff --git a/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py b/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py new file mode 100755 index 00000000000..1cd90e29517 --- /dev/null +++ b/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python + +# Copyright 2017 Johns Hopkins University (author: Hossein Hadian) +# Apache 2.0 + + +""" This script perturbs speeds of utterances to force their lengths to some allowed + lengths spaced by a factor +""" + +import argparse +import os +import sys +import copy +import math + +parser = argparse.ArgumentParser(description="""This script ...""") +parser.add_argument('factor', type=float, default=12, + help='spacing (in percentage) between allowed lengths.') +parser.add_argument('srcdir', type=str, + help='path to source data dir') +parser.add_argument('dir', type=str, help='output dir') +parser.add_argument('--range-factor', type=float, default=0.05, + help="""Percentage of durations not covered from each side of + duration histogram.""") +parser.add_argument('--no-speed-perturb', action='store_true') + +args = parser.parse_args() + +### functions and classes ### + +class Speaker: + def __init__(self, path, sid): + self.path = path + self.name = os.path.basename(os.path.normpath(path)) + self.id = sid + self.utterances = [] + def str_id(self): + return "s" + zero_pad(str(self.id), 4) + +class Utterance: + def __init__(self, uid, wavefile, speaker, transcription, dur): + self.wavefile = wavefile + self.speaker = speaker + self.transcription = transcription + self.id = uid + self.dur = float(dur) + + def to_kaldi_utt_str(self): + return self.id + " " + self.transcription + + def to_kaldi_wave_str(self): + return self.id + " " + self.wavefile + + +def read_kaldi_datadir(dir): + utts = [] + wav_scp = read_kaldi_mapfile(os.path.join(dir, 'wav.scp')) + text = read_kaldi_mapfile(os.path.join(dir, 'text')) + utt2dur = read_kaldi_mapfile(os.path.join(dir, 'utt2dur')) + utt2spk = read_kaldi_mapfile(os.path.join(dir, 'utt2spk')) + for utt in wav_scp: + if utt in text and utt in utt2dur and utt in utt2spk: + utts += [Utterance(utt, wav_scp[utt], utt2spk[utt], text[utt], utt2dur[utt])] + else: + print('Incomplete data for utt {}'.format(utt)) + return utts + + +def read_kaldi_mapfile(path): + m = {} + with open(path, 'r') as f: + for line in f: + line = line.rstrip() + sp_pos = line.find(' ') + key = line[:sp_pos] + val = line[sp_pos+1:] + m[key] = val + return m + +def generate_kaldi_data_files(utterances, outdir): + print "Exporting to ", outdir, "..." 
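In the main section of this script, further down, each candidate duration d is mapped to a frame count with length = int(d*1000 - 25)/10 + 1, rounded down to a multiple of 3, and then converted back to a slightly padded duration. A worked sketch of that mapping; reading the constants as a 25 ms window, 10 ms frame shift, and a rounding that matches frame_subsampling_factor=3 is an inference, not something the script states:

    # Worked example of the duration -> allowed-length mapping used below.
    # Assumes (as the constants suggest) a 25 ms window, 10 ms frame shift,
    # and rounding down to a multiple of 3 frames.
    d = 1.00  # candidate duration in seconds

    length = int(d * 1000 - 25) // 10 + 1      # 98 frames for a 1.00 s utterance
    if length % 3 != 0:
        length = 3 * (length // 3)             # round down -> 96 frames

    d_new = (10.0 * (length - 1) + 25.0 + 5.0) / 1000.0  # +5 ms safety margin
    print(length, d_new)                        # 96 0.98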
+ spks = {} + + f = open(os.path.join(outdir, 'text'), 'w') + for utt in utterances: + f.write(utt.to_kaldi_utt_str() + "\n") + f.close() + + f = open(os.path.join(outdir, 'wav.scp'), 'w') + for utt in utterances: + f.write(utt.to_kaldi_wave_str() + "\n") + f.close() + + f = open(os.path.join(outdir, 'utt2dur'), 'w') + for utt in utterances: + f.write(utt.id + " " + str(utt.dur) + "\n") + f.close() + + f = open(os.path.join(outdir, 'utt2spk'), 'w') + for utt in utterances: + f.write(utt.id + " " + utt.speaker + "\n") + if utt.speaker not in spks: + spks[utt.speaker] = [utt.id] + else: + spks[utt.speaker] += [utt.id] + f.close() + + f = open(os.path.join(outdir, 'spk2utt'), 'w') + for s in spks: + f.write(s + " ") + for utt in spks[s]: + f.write(utt + " ") + f.write('\n') + f.close() + + + + +### main ### + +if not os.path.exists(args.dir): + os.makedirs(args.dir) + +# 0. load src dir +utts = read_kaldi_datadir(args.srcdir) + +factor = 1.0 + float(args.factor)/100 +# 1a. find start-dur and end-dur +## echo "Durs = [" >durs.m && cut -d' ' -f2 data/train_nodup_seg/utt2dur | tr '\n' ',' >>durs.m && echo " ];" >>durs.m +durs = [] +for u in utts: + durs += [u.dur] +durs.sort() +to_ignore_dur = 0 +tot_dur = sum(durs) +for d in durs: + to_ignore_dur += d + if to_ignore_dur * 100.0 / tot_dur > args.range_factor: + start_dur = d + break +to_ignore_dur = 0 +for d in reversed(durs): + to_ignore_dur += d + if to_ignore_dur * 100.0 / tot_dur > args.range_factor: + end_dur = d + break +print("Durations in the range [{},{}] will be covered. Coverage rate: {}%".format(start_dur, end_dur, 100.0-args.range_factor*2)) +print("There will be {} unique allowed lengths for the utterances.".format(int(math.log(end_dur/start_dur)/math.log(factor)))) +#sys.exit(0) + +# 1b. compute and write allowed lengths +#start_dur = 0.88 +#end_dur = 19.00 +durs = [] +d = start_dur +f = open(os.path.join(args.dir, 'allowed_durs.txt'), 'wb') +f2 = open(os.path.join(args.dir, 'allowed_lengths.txt'), 'wb') +while d < end_dur: + length = int(d*1000 - 25) / 10 + 1 # for the most common length of frames and overlap + if length % 3 != 0: + lo = 3 * (length / 3) + hi = lo + 3 + #if length - lo <= hi - length: + # length = lo + #else: + # length = hi + length = lo # should select lo to make sure the jump is not bigger than 12% + dnew = (10.0 * (length - 1.0) + 25.0 + 5.0) / 1000.0 # +5 is for safety + d = dnew + durs += [d] + f.write(str(d) + '\n') + f2.write(str(length) + '\n') + d *= factor +f.close() +f2.close() + +# 2. 
perturb to allowed durs +# sox -t wav seg1.wav -t wav long95.wav speed 0.873684211 +perturbed_utts = [] +durs = durs + [1000000] +for u in utts: + prev_d = 0.0 + i = 0 + for d in durs: + if u.dur <= d and u.dur >= prev_d: + break + i += 1 + prev_d = d + # i determines the closest allowed durs + + if i > 0: + allowed_dur = durs[i - 1] # this is smaller than u.dur + speed = u.dur / allowed_dur + if max(speed, 1.0/speed) > factor: + #print('rejected: {} --> dur was {} speed was {}'.format(u.id, u.dur, speed)) + continue + u1 = copy.deepcopy(u) + u1.id = 'pv1-' + u.id + u1.speaker = 'pv1-' + u.speaker + u1.wavefile = '{} sox -t wav - -t wav - speed {} | '.format(u.wavefile, speed) + u1.dur = allowed_dur + if not args.no_speed_perturb: + perturbed_utts += [u1] + + + if i < len(durs) - 1: + allowed_dur2 = durs[i] # this is bigger than u.dur + speed = u.dur / allowed_dur2 + if max(speed, 1.0/speed) > factor: + #print('no v2/v3 for: {} --> dur was {} speed was {}'.format(u.id, u.dur, speed)) + continue + + ## Add two versions for the second allowed_length + ## one version is by using speed modification using sox + ## the other is by extending by silence + u2 = copy.deepcopy(u) + u2.id = 'pv2-' + u.id + u2.speaker = 'pv2-' + u.speaker + u2.wavefile = '{} sox -t wav - -t wav - speed {} | '.format(u.wavefile, speed) + u2.dur = allowed_dur2 + if not args.no_speed_perturb: + perturbed_utts += [u2] + + delta = allowed_dur2 - u.dur + if delta <= 1e-4: + continue + u3 = copy.deepcopy(u) + prefix = 'pv3-' if not args.no_speed_perturb else '' + u3.id = prefix + u.id + u3.speaker = prefix + u.speaker + + parts = u.wavefile.split() + if len(parts) == 1: + u3.wavefile = 'extend-wav-with-silence --extra-silence-length={1} {0} - | '.format(u.wavefile, delta) + else: + assert parts[-1] == "|" + u3.wavefile = '{0} extend-wav-with-silence --extra-silence-length={1} - - | '.format(u.wavefile, delta) + u3.dur = allowed_dur2 + perturbed_utts += [u3] + +# 3. 
write to our dir +generate_kaldi_data_files(perturbed_utts, args.dir) diff --git a/src/chain/chain-supervision-splitter.cc b/src/chain/chain-supervision-splitter.cc index 0a07876a1a2..2fd73704a6c 100644 --- a/src/chain/chain-supervision-splitter.cc +++ b/src/chain/chain-supervision-splitter.cc @@ -29,10 +29,54 @@ namespace chain { typedef fst::ArcTpl LatticeArc; typedef fst::VectorFst Lattice; +void FstToLattice(const fst::StdVectorFst &fst, Lattice *lat) { + lat->DeleteStates(); + + int32 start_state = fst.Start(); + for (int32 i = 0; i < fst.NumStates(); i++) + lat->AddState(); + + lat->SetStart(start_state); + + for (fst::StdArc::StateId s = 0; s < fst.NumStates(); s++) { + for (fst::ArcIterator aiter(fst, s); + !aiter.Done(); aiter.Next()) { + const fst::StdArc &arc = aiter.Value(); + + LatticeWeight weight = LatticeWeight::One(); + weight.SetValue1(arc.weight.Value()); + + lat->AddArc(s, + LatticeArc(arc.ilabel, arc.olabel, weight, arc.nextstate)); + } + + if (fst.Final(s) != fst::TropicalWeight::Zero()) { + LatticeWeight weight = LatticeWeight::One(); + weight.SetValue1(fst.Final(s).Value()); + lat->SetFinal(s, weight); + } + } +} + SupervisionLatticeSplitter::SupervisionLatticeSplitter( const SupervisionLatticeSplitterOptions &opts, - const TransitionModel &trans_model, const Lattice &lat): - opts_(opts), trans_model_(trans_model), lat_(lat) { + const SupervisionOptions &sup_opts, + const TransitionModel &trans_model): + sup_opts_(sup_opts), opts_(opts), trans_model_(trans_model), + incomplete_phone_(trans_model.NumPhones() + 1) { + + if (opts_.add_partial_unk_label_left) { + MakeFilterFst(); + } + + if (opts_.add_tolerance_to_lat) { + MakeToleranceEnforcerFst(); + } +} + +void SupervisionLatticeSplitter::LoadLattice(const Lattice &lat) { + lat_ = lat; + PrepareLattice(); int32 num_states = lat_.NumStates(); @@ -47,21 +91,58 @@ SupervisionLatticeSplitter::SupervisionLatticeSplitter( KALDI_ASSERT(lat_scores_.state_times[start_state] == 0); } -void SupervisionLatticeSplitter::GetFrameRange( +bool SupervisionLatticeSplitter::GetFrameRangeSupervision( int32 begin_frame, int32 num_frames, - Lattice *lat_out) const { + Supervision *supervision, + Lattice *out_lat) const { int32 end_frame = begin_frame + num_frames; // Note: end_frame is not included in the range of frames that the // output supervision object covers; it's one past the end. KALDI_ASSERT(num_frames > 0 && begin_frame >= 0 && begin_frame + num_frames <= lat_scores_.state_times.back()); - CreateRangeLattice(begin_frame, end_frame, lat_out); + Lattice lat_out; + CreateRangeLattice(begin_frame, end_frame, &lat_out); - if (opts_.acoustic_scale != 1.0) { - fst::ScaleLattice(fst::AcousticLatticeScale( - 1.0 / opts_.acoustic_scale), lat_out); + PostProcessLattice(&lat_out); + + if (out_lat) { + *out_lat = lat_out; + } + + ScaleLattice(fst::LatticeScale(sup_opts_.lm_scale, 0.0), &lat_out); + + supervision->frames_per_sequence = num_frames; + return GetSupervision(lat_out, supervision); +} + +bool SupervisionLatticeSplitter::GetFrameRangeProtoSupervision( + const ContextDependencyInterface &ctx_dep, + const TransitionModel &trans_model, + int32 begin_frame, int32 num_frames, + ProtoSupervision *proto_supervision) const { + + int32 end_frame = begin_frame + num_frames; + // Note: end_frame is not included in the range of frames that the + // output supervision object covers; it's one past the end. 
+ KALDI_ASSERT(num_frames > 0 && begin_frame >= 0 && + begin_frame + num_frames <= lat_scores_.state_times.back()); + + Lattice lat_out; + CreateRangeLattice(begin_frame, end_frame, &lat_out); + + PostProcessLattice(&lat_out); + + if (opts_.debug && GetVerboseLevel() > 2) { + WriteLattice(std::cerr, false, lat_out); } + + CompactLattice clat_part; + ConvertLattice(lat_out, &clat_part); + + + return PhoneLatticeToProtoSupervision(sup_opts_, clat_part, + proto_supervision); } void SupervisionLatticeSplitter::LatticeInfo::Check() const { @@ -72,13 +153,12 @@ void SupervisionLatticeSplitter::LatticeInfo::Check() const { // Check that the states are ordered in increasing order of state_times. // This must be true since the states are in breadth-first search order. KALDI_ASSERT(IsSorted(state_times)); + + KALDI_ASSERT(state_times.back() == num_frames); } void SupervisionLatticeSplitter::PrepareLattice() { - // Scale the lattice to appropriate acoustic scale. It is important to - // ensure this is equal to the acoustic scale used while training. This is - // because, on splitting lattices, the initial and final costs are added - // into the graph cost. + // Scale the lattice to appropriate acoustic scale. KALDI_ASSERT(opts_.acoustic_scale != 0.0); if (opts_.acoustic_scale != 1.0) fst::ScaleLattice(fst::AcousticLatticeScale( @@ -123,13 +203,16 @@ void SupervisionLatticeSplitter::CreateRangeLattice( end_iter = std::lower_bound(begin_iter, state_times.end(), end_frame); + // begin_iter should point to the first state with time == begin_frame KALDI_ASSERT(*begin_iter == begin_frame && (begin_iter == state_times.begin() || begin_iter[-1] < begin_frame)); + // even if end_frame == supervision_.num_frames, there should be a state with // that frame index. KALDI_ASSERT(end_iter[-1] < end_frame && (end_iter < state_times.end() || *end_iter == end_frame)); + StateId begin_state = begin_iter - state_times.begin(), end_state = end_iter - state_times.begin(); @@ -141,13 +224,18 @@ void SupervisionLatticeSplitter::CreateRangeLattice( StateId start_state = out_lat->AddState(); out_lat->SetStart(start_state); + KALDI_ASSERT(out_lat->Start() == 0); + for (StateId i = begin_state; i < end_state; i++) out_lat->AddState(); // Add the special final-state. StateId final_state = out_lat->AddState(); out_lat->SetFinal(final_state, LatticeWeight::One()); - + + StateId prefinal_state = final_state + 1; + bool need_prefinal_state = false; + for (StateId state = begin_state; state < end_state; state++) { StateId output_state = state - begin_state + 1; if (state_times[state] == begin_frame) { @@ -156,7 +244,8 @@ void SupervisionLatticeSplitter::CreateRangeLattice( // from our actual initial state. The weight on this // transition is the forward probability of the said 'initial state' LatticeWeight weight = LatticeWeight::One(); - weight.SetValue1((opts_.normalize ? lat_scores_.beta[0] : 0.0) - lat_scores_.alpha[state]); + weight.SetValue1((opts_.normalize ? lat_scores_.beta[0] : 0.0) + - lat_scores_.alpha[state]); // Add negative of the forward log-probability to the graph cost score, // since the acoustic scores would be changed later. // Assuming that the lattice is scaled with appropriate acoustic @@ -190,35 +279,200 @@ void SupervisionLatticeSplitter::CreateRangeLattice( // Note: We don't normalize here because that is already done with the // initial cost. 
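Reading CreateRangeLattice together with the debug check further below (which asserts that the split lattice's total log-likelihood equals beta[0], or 0 when opts_.normalize is set): the entry arc into a state s at begin_frame carries graph cost beta_0 - alpha(s), i.e. the forward log-probability of s is put back onto the initial transition and, when normalizing, the total log-probability beta_0 is subtracted once. Assuming the exit arcs symmetrically absorb the backward log-probability beta(s) of the state at end_frame (only partly visible in this hunk), each split piece sums to probability one:

    \sum_{\pi:\; s_b \rightarrow s_e}
        \exp\big( \alpha(s_b) - \beta_0 + w(\pi) + \beta(s_e) \big)
      \;=\; e^{-\beta_0} \sum_{\pi} e^{\alpha(s_b) + w(\pi) + \beta(s_e)}
      \;=\; 1 ,

where w(pi) is the log-probability of a path pi within the [begin_frame, end_frame) range, and the last equality is the forward-backward identity that all of the full lattice's probability mass crosses the begin/end time boundaries. This is exactly what the debug block verifies per frame by comparing the split lattice's posteriors against those of the full lattice.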
- out_lat->AddArc(output_state, - LatticeArc(arc.ilabel, arc.olabel, weight, final_state)); + if (!opts_.add_partial_unk_label_left) { + out_lat->AddArc(output_state, + LatticeArc(arc.ilabel, arc.olabel, weight, final_state)); + } else { + fst::ArcIterator next_aiter(lat_, nextstate); + if (!next_aiter.Done() && next_aiter.Value().olabel == 0) { + // This is a split in the middle of a phone. + // So add an arc to the "prefinal state" from which there + // is an arc to the "final state" with special + // "incomplete phone" symbol on the output-label. + + if (!need_prefinal_state) { + prefinal_state = out_lat->AddState(); + need_prefinal_state = true; + } + + out_lat->AddArc(output_state, + LatticeArc(arc.ilabel, arc.olabel, weight, prefinal_state)); + } else { + out_lat->AddArc(output_state, + LatticeArc(arc.ilabel, arc.olabel, weight, final_state)); + } + } } else { StateId output_nextstate = nextstate - begin_state + 1; - if (opts_.add_phone_label_for_half_transition) { + Label olabel = arc.olabel; + + if (state_times[state] == begin_frame && + (opts_.add_partial_phone_label_right || + opts_.add_partial_unk_label_right)) { int32 tid = arc.ilabel; int32 phone = trans_model_.TransitionIdToPhone(tid); - Label olabel = arc.olabel; + if (opts_.add_partial_unk_label_right) { + KALDI_ASSERT(opts_.unk_phone > 0); + phone = opts_.unk_phone; + } if (olabel == 0) { + // This is a split in the middle of a phone. + // So add a phone label as the output label. olabel = phone; - } else { - KALDI_ASSERT(phone == olabel); } - out_lat->AddArc(output_state, - LatticeArc(arc.ilabel, olabel, arc.weight, output_nextstate)); - } else { - out_lat->AddArc(output_state, - LatticeArc(arc.ilabel, arc.olabel, arc.weight, output_nextstate)); + } + out_lat->AddArc(output_state, + LatticeArc(arc.ilabel, olabel, arc.weight, output_nextstate)); + } + } + } + + if (need_prefinal_state) { + // Add an "incomplete phone" label as the output symbol in the + // last arc + out_lat->AddArc(prefinal_state, + LatticeArc(0, incomplete_phone_, LatticeWeight::One(), + final_state)); + } + + KALDI_ASSERT(out_lat->Start() == 0); + + if (opts_.debug) { + Posterior post; + + Lattice &temp_lat(*out_lat); + //fst::RmEpsilon(&temp_lat); + fst::TopSort(&temp_lat); + + double like = LatticeForwardBackward(temp_lat, &post); + + KALDI_ASSERT(kaldi::ApproxEqual( + like + (opts_.normalize ? 
lat_scores_.beta[0] : 0.0), + lat_scores_.beta[0])); + + const Posterior &full_post = lat_scores_.post; + + for (int32 t = begin_frame; t < end_frame; t++) { + KALDI_ASSERT(full_post[t].size() == post[t - begin_frame].size()); + + for (int32 j = 0; j < full_post[t].size(); j++) { + KALDI_ASSERT(post[t - begin_frame][j].first == full_post[t][j].first); + if (post[t-begin_frame][j].second < 0.1) + continue; + if (!kaldi::ApproxEqual(post[t - begin_frame][j].second, + full_post[t][j].second)) { + WritePosterior(std::cerr, false, full_post); + WritePosterior(std::cerr, false, post); + + std::vector alphas; + std::vector betas; + ComputeLatticeAlphasAndBetas(temp_lat, false, &alphas, &betas); + + fst::StdVectorFst full_fst; + Lattice full_lat(lat_); + fst::ScaleLattice(fst::AcousticLatticeScale(0), &full_lat); + ConvertLattice(full_lat, &full_fst); + WriteFstKaldi(std::cerr, false, full_fst); + + fst::StdVectorFst split_fst; + fst::ScaleLattice(fst::AcousticLatticeScale(0), out_lat); + ConvertLattice(*out_lat, &split_fst); + WriteFstKaldi(std::cerr, false, split_fst); + + KALDI_ASSERT(false); } } } } } +void SupervisionLatticeSplitter::PostProcessLattice(Lattice *out_lat) const { + if (opts_.add_partial_unk_label_left) { + if (opts_.debug && GetVerboseLevel() > 2) { + WriteLattice(std::cerr, false, *out_lat); + } + + fst::TableComposeOptions compose_opts; + compose_opts.table_match_type = fst::MATCH_OUTPUT; + + Lattice filter_lat; + FstToLattice(filter_fst_, &filter_lat); + + Lattice temp_lat; + TableCompose(*out_lat, filter_lat, &temp_lat); + + std::swap(temp_lat, *out_lat); + + if (opts_.debug && GetVerboseLevel() > 2) { + WriteLattice(std::cerr, false, *out_lat); + } + } + + fst::RmEpsilon(out_lat); + + if (opts_.acoustic_scale != 1.0) { + fst::ScaleLattice(fst::AcousticLatticeScale( + 1.0 / opts_.acoustic_scale), out_lat); + } +} + +bool SupervisionLatticeSplitter::GetSupervision( + const Lattice &lat, Supervision *supervision) const { + fst::StdVectorFst transition_id_fst; + ConvertLattice(lat, &transition_id_fst); + Project(&transition_id_fst, fst::PROJECT_INPUT); // Keep only the transition-ids. + if (transition_id_fst.Properties(fst::kIEpsilons, true) != 0) { + // remove epsilons, if there are any. + fst::RmEpsilon(&transition_id_fst); + } + + KALDI_ASSERT(transition_id_fst.NumStates() > 0); + + if (opts_.add_tolerance_to_lat) { + fst::TableComposeOptions compose_opts; + compose_opts.table_match_type = fst::MATCH_INPUT; + + TableCompose(transition_id_fst, tolerance_fst_, &(supervision->fst), + compose_opts); + } else { + std::swap(transition_id_fst, supervision->fst); + } + + fst::Connect(&(supervision->fst)); + + // at this point supervision->fst will have pdf-ids plus one as the olabels, + // but still transition-ids as the ilabels. Copy olabels to ilabels. + fst::Project(&(supervision->fst), fst::PROJECT_OUTPUT); + + fst::RmEpsilon(&(supervision->fst)); + fst::DeterminizeInLog(&(supervision->fst)); + + KALDI_ASSERT(supervision->fst.Properties(fst::kIEpsilons, true) == 0); + if (supervision->fst.NumStates() == 0) { + KALDI_WARN << "Supervision FST is empty (too many phones for too few " + << "frames?)"; + // possibly there were too many phones for too few frames. 
+ return false; + } + + supervision->weight = 1.0; + supervision->num_sequences = 1; + supervision->label_dim = trans_model_.NumPdfs(); + SortBreadthFirstSearch(&(supervision->fst)); + + return true; +} + void SupervisionLatticeSplitter::ComputeLatticeScores() { - LatticeStateTimes(lat_, &(lat_scores_.state_times)); + lat_scores_.Reset(); + lat_scores_.num_frames = LatticeStateTimes(lat_, &(lat_scores_.state_times)); + + if (opts_.debug) + LatticeForwardBackward(lat_, &(lat_scores_.post)); + ComputeLatticeAlphasAndBetas(lat_, false, &(lat_scores_.alpha), &(lat_scores_.beta)); lat_scores_.Check(); @@ -417,15 +671,41 @@ void ToleranceEnforcerFstCreator::MakeFst() { } KALDI_ASSERT(fst_->Start() == zero_offset_index_ * (num_forward_transitions_ + 1)); + + fst::ArcSort(fst_, fst::ILabelCompare()); } -void MakeToleranceEnforcerFst( - const SupervisionOptions &opts, const TransitionModel &trans_model, - fst::StdVectorFst *fst) { - ToleranceEnforcerFstCreator creator(opts, trans_model, fst); +void SupervisionLatticeSplitter::MakeToleranceEnforcerFst() { + ToleranceEnforcerFstCreator creator(sup_opts_, trans_model_, &tolerance_fst_); creator.MakeFst(); } +void SupervisionLatticeSplitter::MakeFilterFst() { + filter_fst_.DeleteStates(); + filter_fst_.AddState(); + filter_fst_.AddState(); + filter_fst_.AddState(); + + filter_fst_.SetStart(0); + + const std::vector &phones = trans_model_.GetPhones(); + for (std::vector::const_iterator it = phones.begin(); + it != phones.end(); ++it) { + filter_fst_.AddArc(0, fst::StdArc(*it, *it, + fst::TropicalWeight::One(), 0)); + filter_fst_.AddArc(0, fst::StdArc(*it, opts_.unk_phone, + fst::TropicalWeight::One(), 1)); + } + filter_fst_.AddArc(1, fst::StdArc(incomplete_phone_, 0, + fst::TropicalWeight::One(), 2)); + + filter_fst_.SetFinal(0, fst::TropicalWeight::One()); + filter_fst_.SetFinal(2, fst::TropicalWeight::One()); + + fst::ArcSort(&filter_fst_, fst::ILabelCompare()); +} + +/* bool PhoneLatticeToSupervision(const fst::StdVectorFst &tolerance_fst, const TransitionModel &trans_model, const Lattice &lat, @@ -476,6 +756,7 @@ bool PhoneLatticeToSupervision(const fst::StdVectorFst &tolerance_fst, SortBreadthFirstSearch(&(supervision->fst)); return true; } +*/ } // end namespace chain } // end namespace kaldi diff --git a/src/chain/chain-supervision-splitter.h b/src/chain/chain-supervision-splitter.h index 7e5ba7845bf..8b6b43caec5 100644 --- a/src/chain/chain-supervision-splitter.h +++ b/src/chain/chain-supervision-splitter.h @@ -35,11 +35,22 @@ typedef fst::VectorFst Lattice; struct SupervisionLatticeSplitterOptions { BaseFloat acoustic_scale; bool normalize; - bool add_phone_label_for_half_transition; + bool add_partial_phone_label_left; + bool add_partial_phone_label_right; + bool add_partial_unk_label_left; + bool add_partial_unk_label_right; + int32 unk_phone; + bool add_tolerance_to_lat; + bool debug; SupervisionLatticeSplitterOptions(): acoustic_scale(1.0), normalize(true), - add_phone_label_for_half_transition(false) { } + add_partial_phone_label_left(false), + add_partial_phone_label_right(false), + add_partial_unk_label_left(false), + add_partial_unk_label_right(false), + unk_phone(0), + add_tolerance_to_lat(true), debug(false) { } void Register(OptionsItf *opts) { opts->Register("acoustic-scale", &acoustic_scale, @@ -47,21 +58,52 @@ struct SupervisionLatticeSplitterOptions { opts->Register("normalize", &normalize, "Normalize the initial and final scores added to split " "lattices"); - opts->Register("add-phone-label-for-half-transition", - 
&add_phone_label_for_half_transition, - "Add a phone label to account for half phone transitions " - "in the split lattices"); + opts->Register("add-partial-phone-label-left", + &add_partial_phone_label_left, + "Add a phone label to account for partial phone transitions " + "in the left split lattices"); + opts->Register("add-partial-phone-label-right", + &add_partial_phone_label_right, + "Add a phone label to account for partial phone transitions " + "in the right split lattices"); + opts->Register("add-partial-unk-label-left", + &add_partial_unk_label_left, + "Add an UNK phone to account for partial phone transitions " + "in the left split lattices"); + opts->Register("add-partial-unk-label-right", + &add_partial_unk_label_right, + "Add an UNK phone to account for partial phone transitions " + "in the right split lattices"); + opts->Register("unk-phone", &unk_phone, + "UNK phone is added at half transition"); + opts->Register("add-tolerance-to-lat", &add_tolerance_to_lat, + "If this is true, then the tolerance is directly added " + "to the lattice by inserting or deleting self-loop " + "transitions"); + opts->Register("debug", &debug, + "Run some debug test codes"); } }; class SupervisionLatticeSplitter { public: SupervisionLatticeSplitter(const SupervisionLatticeSplitterOptions &opts, - const TransitionModel &trans_model, - const Lattice &lat); + const SupervisionOptions &sup_opts, + const TransitionModel &trans_model); - void GetFrameRange(int32 begin_frame, int32 frames_per_sequence, - Lattice *out_lat) const; + void LoadLattice(const Lattice &lat); + + bool GetFrameRangeSupervision(int32 begin_frame, int32 frames_per_sequence, + chain::Supervision *supervision, + Lattice *lat = NULL) const; + + bool GetFrameRangeProtoSupervision( + const ContextDependencyInterface &ctx_dep, + const TransitionModel &trans_model, + int32 begin_frame, int32 num_frames, + ProtoSupervision *proto_supervision) const; + + int32 NumFrames() const { return lat_scores_.num_frames; } // A structure used to store the forward and backward scores // and state times of a lattice @@ -70,6 +112,15 @@ class SupervisionLatticeSplitter { std::vector alpha; std::vector beta; std::vector state_times; + std::vector > > post; + int32 num_frames; + + void Reset() { + alpha.clear(); + beta.clear(); + state_times.clear(); + post.clear(); + } void Check() const; }; @@ -83,6 +134,10 @@ class SupervisionLatticeSplitter { void CreateRangeLattice(int32 begin_frame, int32 end_frame, Lattice *out_lat) const; + void PostProcessLattice(Lattice *out_lat) const; + + bool GetSupervision(const Lattice &out_lat, Supervision *supervision) const; + // Function to compute lattice scores for a lattice void ComputeLatticeScores(); @@ -91,18 +146,31 @@ class SupervisionLatticeSplitter { // 2) Compute states times, which must be a strictly non-decreasing vector // 3) Compute lattice alpha and beta scores void PrepareLattice(); - + + const SupervisionOptions &sup_opts_; + const SupervisionLatticeSplitterOptions &opts_; const TransitionModel &trans_model_; - // LatticeInfo object for lattice. - // This will be computed when PrepareLattice function is called. - LatticeInfo lat_scores_; + fst::StdVectorFst tolerance_fst_; + void MakeToleranceEnforcerFst(); - // Copy of the lattice. This is required because the lattice states + const int32 incomplete_phone_; // Equal to trans_model_.NumPhones() + 1 + + // Used to remove "incomplete phone" label + // Applicable only when opts_.add_partial_unk_label_left is true. 
+ fst::StdVectorFst filter_fst_; + void MakeFilterFst(); + + // Copy of the lattice loaded using LoadLattice(). + // This is required because the lattice states // need to be ordered in breadth-first search order. Lattice lat_; + + // LatticeInfo object for lattice. + // This will be computed when PrepareLattice function is called. + LatticeInfo lat_scores_; }; bool PhoneLatticeToSupervision(const fst::StdVectorFst &tolerance_fst, @@ -111,9 +179,42 @@ bool PhoneLatticeToSupervision(const fst::StdVectorFst &tolerance_fst, chain::Supervision *supervision, bool debug = false); -void MakeToleranceEnforcerFst( - const SupervisionOptions &opts, const TransitionModel &trans_model, - fst::StdVectorFst *fst); +void FixLattice(const fst::StdVectorFst &lattice_fixer_fst, + const Lattice &lat, CompactLattice *clat); + +void MakeLatticeFixerFst(const TransitionModel &trans_model, + fst::StdVectorFst *fst); + +/* +class LatticeFixerFst: + public fst::DeterministicOnDemandFst { + public: + typedef fst::StdArc::Weight Weight; + typedef fst::StdArc::StateId StateId; + typedef fst::StdArc::Label Label; + + LatticeFixerFst(const TransitionModel &trans_model): + trans_model_(trans_model) { } + + // We cannot use "const" because the pure virtual function in the interface is + // not const. + virtual StateId Start() { return 0; } + + virtual Weight Final(StateId s) { + return Weight::One(); + } + + // The ilabel is a transition-id; the state is interpreted as a frame-index. + // The olabel on oarc will be a pdf-id. The state-id is the time index 0 <= t + // <= num_frames. All transitions are to the next frame (but not all are + // allowed). The interface of GetArc requires ilabel to be nonzero (not + // epsilon). + virtual bool GetArc(StateId s, Label ilabel, fst::StdArc* oarc); + + private: + const TransitionModel &trans_model_; +}; +*/ } } diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile index 51f97ff7c55..c1a4152dd15 100644 --- a/src/chainbin/Makefile +++ b/src/chainbin/Makefile @@ -11,7 +11,8 @@ BINFILES = chain-est-phone-lm chain-get-supervision chain-make-den-fst \ nnet3-chain-shuffle-egs nnet3-chain-subset-egs \ nnet3-chain-acc-lda-stats nnet3-chain-train nnet3-chain-compute-prob \ nnet3-chain-combine nnet3-chain-normalize-egs \ - nnet3-chain-split-and-get-egs chain-split-lattices + nnet3-chain-split-and-get-egs chain-split-lattices \ + nnet3-chain-split-convert-and-get-egs OBJFILES = diff --git a/src/chainbin/chain-split-lattices.cc b/src/chainbin/chain-split-lattices.cc index d8544cf6ba2..2e4c3232a25 100644 --- a/src/chainbin/chain-split-lattices.cc +++ b/src/chainbin/chain-split-lattices.cc @@ -41,19 +41,15 @@ namespace nnet3 { you should do it later with nnet3-chain-normalize-egs. 
*/ -static bool ProcessFile(const chain::SupervisionOptions &sup_opts, - const chain::SupervisionLatticeSplitterOptions &sup_lat_splitter_opts, - const TransitionModel &trans_model, - const Lattice &lat, - const fst::StdVectorFst &tolerance_fst, +static bool ProcessFile(const chain::SupervisionLatticeSplitter &sup_lat_splitter, const std::string &utt_id, UtteranceSplitter *utt_splitter, TableWriter *fst_writer, - bool debug = true) { + LatticeWriter *lat_writer) { std::vector state_times; int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; - int32 num_frames = LatticeStateTimes(lat, &state_times) * frame_subsampling_factor; + int32 num_frames = sup_lat_splitter.NumFrames() * frame_subsampling_factor; std::vector chunks; @@ -66,9 +62,6 @@ static bool ProcessFile(const chain::SupervisionOptions &sup_opts, return false; } - chain::SupervisionLatticeSplitter sup_lat_splitter( - sup_lat_splitter_opts, trans_model, lat); - for (size_t c = 0; c < chunks.size(); c++) { ChunkTimeInfo &chunk = chunks[c]; @@ -76,22 +69,18 @@ static bool ProcessFile(const chain::SupervisionOptions &sup_opts, num_frames_subsampled = chunk.num_frames / frame_subsampling_factor; Lattice lat_part; - sup_lat_splitter.GetFrameRange(start_frame_subsampled, - num_frames_subsampled, - &lat_part); - - ScaleLattice(fst::LatticeScale(1.0, 0.0), &lat_part); - chain::Supervision supervision_part; - chain::PhoneLatticeToSupervision(tolerance_fst, - trans_model, lat_part, - &supervision_part, debug); + sup_lat_splitter.GetFrameRangeSupervision(start_frame_subsampled, + num_frames_subsampled, + &supervision_part, + &lat_part); std::ostringstream oss; oss << utt_id << "-" << start_frame_subsampled << "-" << num_frames_subsampled; std::string key = oss.str(); fst_writer->Write(key, supervision_part.fst); + lat_writer->Write(key, lat_part); } return true; } @@ -110,18 +99,16 @@ int main(int argc, char *argv[]) { "Split lattices to chain supervision FSTs\n" "\n" "Usage: chain-split-lattices [options] " - " \n"; + " []\n"; ExampleGenerationConfig eg_config; // controls num-frames, // left/right-context, etc. 
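  // The splitting workflow this binary now follows is: construct the splitter
  // once, then for each utterance load its (non-compact) lattice and request
  // a supervision object per chunk.  A minimal sketch, assuming the option
  // structs and 'trans_model' declared below are already set up:
  //
  //   chain::SupervisionLatticeSplitter splitter(sup_lat_splitter_opts,
  //                                              sup_opts, trans_model);
  //   splitter.LoadLattice(lat);   // sorts lattice, computes alphas/betas
  //   chain::Supervision part;
  //   Lattice lat_part;
  //   if (splitter.GetFrameRangeSupervision(begin_frame, num_frames,
  //                                         &part, &lat_part))
  //     fst_writer.Write(key, part.fst);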
chain::SupervisionOptions sup_opts; int32 srand_seed = 0; - bool debug = true; ParseOptions po(usage); po.Register("srand", &srand_seed, "Seed for random number generator "); - po.Register("debug", &debug, "Get FST before projection"); eg_config.Register(&po); @@ -135,7 +122,7 @@ int main(int argc, char *argv[]) { srand(srand_seed); - if (po.NumArgs() != 3) { + if (po.NumArgs() != 3 && po.NumArgs() != 4) { po.PrintUsage(); exit(1); } @@ -146,6 +133,8 @@ int main(int argc, char *argv[]) { trans_model_rxfilename = po.GetArg(1); lattice_rspecifier = po.GetArg(2); fst_wspecifier = po.GetArg(3); + + std::string lattice_wspecifier = po.GetOptArg(4); eg_config.ComputeDerived(); UtteranceSplitter utt_splitter(eg_config); @@ -155,21 +144,21 @@ int main(int argc, char *argv[]) { SequentialLatticeReader lattice_reader(lattice_rspecifier); TableWriter fst_writer(fst_wspecifier); + LatticeWriter lattice_writer(lattice_wspecifier); int32 num_err = 0; - fst::StdVectorFst tolerance_fst; - MakeToleranceEnforcerFst(sup_opts, trans_model, &tolerance_fst); - if (GetVerboseLevel() > 3) WriteFstKaldi(KALDI_LOG, false, tolerance_fst); - - fst::ArcSort(&tolerance_fst, fst::ILabelCompare()); + chain::SupervisionLatticeSplitter sup_lat_splitter( + sup_lat_splitter_opts, sup_opts, trans_model); for (; !lattice_reader.Done(); lattice_reader.Next()) { std::string key = lattice_reader.Key(); const Lattice &lat = lattice_reader.Value(); - if (!ProcessFile(sup_opts, sup_lat_splitter_opts, - trans_model, lat, tolerance_fst, - key, &utt_splitter, &fst_writer, debug)) + + sup_lat_splitter.LoadLattice(lat); + if (!ProcessFile(sup_lat_splitter, + key, &utt_splitter, &fst_writer, + &lattice_writer)) num_err++; } if (num_err > 0) diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index b644ba0aa01..2287bedeb7b 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -99,6 +99,7 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, << (chunk.first_frame + chunk.num_frames) << ", FST was empty after composing with normalization FST. " << "This should be extremely rare (a few per corpus, at most)"; + return false; } int32 first_frame = 0; // we shift the time-indexes of all these parts so diff --git a/src/lat/lattice-functions.cc b/src/lat/lattice-functions.cc index 94c12343e5a..9e59318e7e1 100644 --- a/src/lat/lattice-functions.cc +++ b/src/lat/lattice-functions.cc @@ -388,6 +388,11 @@ BaseFloat LatticeForwardBackward(const Lattice &lat, Posterior *post, if (!ApproxEqual(tot_forward_prob, tot_backward_prob, 1e-8)) { KALDI_WARN << "Total forward probability over lattice = " << tot_forward_prob << ", while total backward probability = " << tot_backward_prob; + + if (!ApproxEqual(tot_forward_prob, tot_backward_prob, 1e-2)) { + KALDI_ERR << "Total forward probability over lattice = " << tot_forward_prob + << ", while total backward probability = " << tot_backward_prob; + } } // Now combine any posteriors with the same transition-id. 
for (int32 t = 0; t < max_time; t++) @@ -461,8 +466,10 @@ double ComputeLatticeAlphasAndBetas(const LatticeType &lat, typedef typename Arc::StateId StateId; StateId num_states = lat.NumStates(); - KALDI_ASSERT(lat.Properties(fst::kTopSorted, true) == fst::kTopSorted); + KALDI_ASSERT(lat.Properties(fst::kTopSorted, true) == fst::kTopSorted); KALDI_ASSERT(lat.Start() == 0); + alpha->clear(); + beta->clear(); alpha->resize(num_states, kLogZeroDouble); beta->resize(num_states, kLogZeroDouble); @@ -499,6 +506,11 @@ double ComputeLatticeAlphasAndBetas(const LatticeType &lat, if (!ApproxEqual(tot_forward_prob, tot_backward_prob, 1e-8)) { KALDI_WARN << "Total forward probability over lattice = " << tot_forward_prob << ", while total backward probability = " << tot_backward_prob; + + if (!ApproxEqual(tot_forward_prob, tot_backward_prob, 1e-2)) { + KALDI_ERR << "Total forward probability over lattice = " << tot_forward_prob + << ", while total backward probability = " << tot_backward_prob; + } } // Split the difference when returning... they should be the same. return 0.5 * (tot_backward_prob + tot_forward_prob); diff --git a/src/latbin/lattice-to-fst.cc b/src/latbin/lattice-to-fst.cc index 16ef0f60ce6..30687e86232 100644 --- a/src/latbin/lattice-to-fst.cc +++ b/src/latbin/lattice-to-fst.cc @@ -80,6 +80,7 @@ int main(int argc, char *argv[]) { BaseFloat lm_scale = 0.0; bool rm_eps = true, read_compact = true, convert_to_pdf_labels = false; std::string trans_model; + bool project_input = false, project_output = true; const char *usage = "Turn lattices into normal FSTs, retaining only the word labels\n" @@ -97,6 +98,12 @@ int main(int argc, char *argv[]) { "Convert lattice to pdf labels"); po.Register("trans-model", &trans_model, "Transition model"); + po.Register("project-input", &project_input, + "Project to input labels (transition-ids); applicable only " + "when --read-compact=false"); + po.Register("project-output", &project_output, + "Project to output labels (transition-ids); applicable only " + "when --read-compact=false"); po.Read(argc, argv); @@ -164,7 +171,10 @@ int main(int argc, char *argv[]) { } else { ConvertLattice(lat, &fst); } - Project(&fst, fst::PROJECT_INPUT); + if (project_input) + Project(&fst, fst::PROJECT_INPUT); + else if (project_output) + Project(&fst, fst::PROJECT_OUTPUT); if (rm_eps) RemoveEpsLocal(&fst); fst_writer.Write(key, fst); diff --git a/src/nnet3/nnet-chain-combine.cc b/src/nnet3/nnet-chain-combine.cc index 05e35422dd0..ed5e054855a 100644 --- a/src/nnet3/nnet-chain-combine.cc +++ b/src/nnet3/nnet-chain-combine.cc @@ -52,6 +52,7 @@ NnetChainCombiner::NnetChainCombiner(const NnetCombineConfig &combine_config, ComputeUpdatableComponentDims(); NnetComputeProbOptions compute_prob_opts; compute_prob_opts.compute_deriv = true; + compute_prob_opts.objective_scales_str = combine_config.objective_scales_str; prob_computer_ = new NnetChainComputeProb(compute_prob_opts, chain_config_, den_fst_, nnet_); } diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index c31c0ed90ac..01246e26499 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -68,6 +68,23 @@ NnetChainComputeProb::NnetChainComputeProb( sil_indices_.Resize(num_pdfs); sil_indices_.CopyFromVec(indices); } + + if (!nnet_config.objective_scales_str.empty()) { + std::vector objectives_for_outputs; + SplitStringToVector(nnet_config.objective_scales_str, ",", false, + &objectives_for_outputs); + std::vector::const_iterator it = 
objectives_for_outputs.begin(); + for (; it != objectives_for_outputs.end(); ++it) { + std::vector this_output_objective; + SplitStringToVector(*it, ":", false, + &this_output_objective); + + BaseFloat scale; + ConvertStringToReal(this_output_objective[1], &scale); + objective_scales_.insert( + std::make_pair(this_output_objective[0], scale)); + } + } } @@ -200,6 +217,18 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, &tot_like, &tot_l2_term, &tot_weight, (nnet_config_.compute_deriv ? &nnet_output_deriv : NULL), (use_xent ? &xent_deriv : NULL)); + + { + unordered_map::iterator it = + objective_scales_.find(sup.name); + + if (it != objective_scales_.end()) { + tot_like *= it->second; + tot_weight *= it->second; + if (nnet_config_.compute_deriv) + nnet_output_deriv.Scale(it->second); + } + } // note: in this context we don't want to apply 'sup.deriv_weights' because // this code is used only in combination, where it's part of an L-BFGS @@ -231,6 +260,14 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, // computation. note, xent_deriv has a factor of '.supervision.weight', // but so does tot_weight. BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans); + unordered_map::iterator it = + objective_scales_.find(xent_name); + + if (it != objective_scales_.end()) { + xent_objf *= it->second; + xent_deriv.Scale(it->second); + } + xent_totals.tot_weight += tot_weight; xent_totals.tot_like += xent_objf; } diff --git a/src/nnet3/nnet-chain-diagnostics.h b/src/nnet3/nnet-chain-diagnostics.h index 5658446410c..65456e5ec8d 100644 --- a/src/nnet3/nnet-chain-diagnostics.h +++ b/src/nnet3/nnet-chain-diagnostics.h @@ -106,6 +106,8 @@ class NnetChainComputeProb { unordered_map objf_info_; CuArray sil_indices_; + + unordered_map objective_scales_; }; /// This function zeros the stored component-level stats in the nnet using diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 6c6ecd1685a..70113b14aaa 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -79,6 +79,23 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, sil_indices_.Resize(num_pdfs); sil_indices_.CopyFromVec(indices); } + + if (!opts.nnet_config.objective_scales_str.empty()) { + std::vector objectives_for_outputs; + SplitStringToVector(opts.nnet_config.objective_scales_str, ",", false, + &objectives_for_outputs); + std::vector::const_iterator it = objectives_for_outputs.begin(); + for (; it != objectives_for_outputs.end(); ++it) { + std::vector this_output_objective; + SplitStringToVector(*it, ":", false, + &this_output_objective); + + BaseFloat scale; + ConvertStringToReal(this_output_objective[1], &scale); + objective_scales_.insert( + std::make_pair(this_output_objective[0], scale)); + } + } } @@ -220,6 +237,17 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, (use_xent ? &xent_deriv : NULL)); } + { + unordered_map::iterator it = + objective_scales_.find(sup.name); + + if (it != objective_scales_.end()) { + tot_objf *= it->second; + tot_weight *= it->second; + nnet_output_deriv.Scale(it->second); + } + } + if (use_xent) { // this block computes the cross-entropy objective. const CuMatrixBase &xent_output = computer->GetOutput( @@ -227,6 +255,15 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, // at this point, xent_deriv is posteriors derived from the numerator // computation. 
note, xent_objf has a factor of '.supervision.weight' BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans); + + unordered_map::iterator it = + objective_scales_.find(xent_name); + + if (it != objective_scales_.end()) { + xent_objf *= it->second; + xent_deriv.Scale(it->second); + } + objf_info_[xent_name + suffix].UpdateStats(xent_name + suffix, opts_.nnet_config.print_interval, num_minibatches_processed_, diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h index 9d57217a87c..9fe73f9c726 100644 --- a/src/nnet3/nnet-chain-training.h +++ b/src/nnet3/nnet-chain-training.h @@ -116,6 +116,8 @@ class NnetChainTrainer { int32 srand_seed_; CuArray sil_indices_; + + unordered_map objective_scales_; }; diff --git a/src/nnet3/nnet-combine.h b/src/nnet3/nnet-combine.h index 5b60d30b8ed..6a4491831fe 100644 --- a/src/nnet3/nnet-combine.h +++ b/src/nnet3/nnet-combine.h @@ -50,6 +50,7 @@ struct NnetCombineConfig { bool enforce_sum_to_one; BaseFloat sum_to_one_penalty; bool separate_weights_per_component; + std::string objective_scales_str; NnetCombineConfig(): num_iters(60), initial_impr(0.01), max_effective_inputs(15), @@ -83,6 +84,10 @@ struct NnetCombineConfig { po->Register("separate-weights-per-component", &separate_weights_per_component, "If true, have a separate weight for each updatable component in " "the nnet."); + po->Register("objective-scales", &objective_scales_str, + "Objective scales for the outputs specified as " + "a comma-separated list of pairs " + ":,:..."); } }; diff --git a/src/nnet3/nnet-diagnostics.h b/src/nnet3/nnet-diagnostics.h index d9ea3166a19..f0c905a3c3e 100644 --- a/src/nnet3/nnet-diagnostics.h +++ b/src/nnet3/nnet-diagnostics.h @@ -62,6 +62,7 @@ struct NnetComputeProbOptions { bool store_component_stats; bool compute_per_dim_accuracy; + std::string objective_scales_str; NnetOptimizeOptions optimize_config; NnetComputeOptions compute_config; @@ -83,6 +84,10 @@ struct NnetComputeProbOptions { "accuracy values as well as objective functions"); opts->Register("compute-per-dim-accuracy", &compute_per_dim_accuracy, "If true, compute accuracy values per-dim"); + opts->Register("objective-scales", &objective_scales_str, + "Objective scales for the outputs specified as " + "a comma-separated list of pairs " + ":,:..."); // register the optimization options with the prefix "optimization". ParseOptions optimization_opts("optimization", opts); diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 5a0eebd9e9a..dadc64484a8 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -816,6 +816,43 @@ void UtteranceSplitter::GetGapSizes(int32 utterance_length, void UtteranceSplitter::GetChunksForUtterance( int32 utterance_length, std::vector *chunk_info) { + + if (config_.no_chunking) { + int32 min_diff = 100; + int32 len_extend_context = 0; + + for (std::vector::const_iterator it = config_.num_frames.begin(); + it != config_.num_frames.end(); ++it) { + if (abs(utterance_length - *it) < abs(min_diff)) + min_diff = utterance_length - *it; + } + + if (min_diff != 0) { + KALDI_WARN << "No exact match found for the length " << utterance_length + << " closest allowed length is off by " << min_diff + << " frames. 
Will try to fix it.."; + + if (abs(min_diff) < 5) // we assume possibly up to 5 frames from the end can be safely deleted + len_extend_context = -min_diff; // let the code below do it + else // unexpected + KALDI_ERR << "Too much length difference " << min_diff; + } + + chunk_info->resize(1); + ChunkTimeInfo &info = (*chunk_info)[0]; + + info.first_frame = 0; + info.num_frames = utterance_length + len_extend_context; + info.left_context = (config_.left_context_initial >= 0 ? + config_.left_context_initial : config_.left_context); + info.right_context = (config_.right_context_final >= 0 ? + config_.right_context_final : config_.right_context); + + SetOutputWeights(utterance_length, chunk_info); + AccStatsForUtterance(utterance_length, *chunk_info); + return; + } + std::vector chunk_sizes; GetChunkSizesForUtterance(utterance_length, &chunk_sizes); std::vector gaps(chunk_sizes.size()); diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 3dcd90eb980..6c7ee0a85d8 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -87,6 +87,7 @@ struct ExampleGenerationConfig { int32 num_frames_overlap; int32 frame_subsampling_factor; std::string num_frames_str; + bool no_chunking; // The following parameters are derived parameters, computed by @@ -101,7 +102,7 @@ struct ExampleGenerationConfig { left_context(0), right_context(0), left_context_initial(-1), right_context_final(-1), num_frames_overlap(0), frame_subsampling_factor(1), - num_frames_str("1") { } + num_frames_str("1"), no_chunking(false) { } /// This function decodes 'num_frames_str' into 'num_frames', and ensures that /// the members of 'num_frames' are multiples of 'frame_subsampling_factor'. @@ -140,6 +141,9 @@ struct ExampleGenerationConfig { po->Register("frame-subsampling-factor", &frame_subsampling_factor, "Used " "if the frame-rate of the output labels in the generated " "examples will be less than the frame-rate at the input"); + po->Register("no-chunking", &no_chunking, "If set to true, then the " + "whole utterance will be used and there will be no " + "chunking"); } }; diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h index 4dc6f667a64..02ce0ff550a 100644 --- a/src/nnet3/nnet-training.h +++ b/src/nnet3/nnet-training.h @@ -42,6 +42,7 @@ struct NnetTrainerOptions { std::string write_cache; bool binary_write_cache; BaseFloat max_param_change; + std::string objective_scales_str; NnetOptimizeOptions optimize_config; NnetComputeOptions compute_config; CachingOptimizingCompilerOptions compiler_config; @@ -88,6 +89,10 @@ struct NnetTrainerOptions { "write the cached computation to"); opts->Register("binary-write-cache", &binary_write_cache, "Write " "computation cache in binary mode"); + opts->Register("objective-scales", &objective_scales_str, + "Objective scales for the outputs specified as " + "a comma-separated list of pairs " + ":,:..."); // register the optimization options with the prefix "optimization". 
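    // An example of the --objective-scales format registered above
    // (hypothetical output names):
    //   --objective-scales="output:1.0,output-xent:0.1"
    // i.e. a comma-separated list of <output-name>:<scale> pairs; the string
    // is split on ',' and then on ':', and the scales are looked up by output
    // name when the corresponding objectives and derivatives are scaled.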
ParseOptions optimization_opts("optimization", opts); diff --git a/src/online2bin/extend-wav-with-silence.cc b/src/online2bin/extend-wav-with-silence.cc index ce4e3ef904c..14b16ea054d 100644 --- a/src/online2bin/extend-wav-with-silence.cc +++ b/src/online2bin/extend-wav-with-silence.cc @@ -76,16 +76,47 @@ int main(int argc, char *argv[]) { exit(1); } - std::string wav_rspecifier = po.GetArg(1); - std::string wav_wspecifier = po.GetArg(2); + if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) { + SequentialTableReader reader(po.GetArg(1)); + TableWriter writer(po.GetArg(2)); + int32 num_success = 0; + + for(; !reader.Done(); reader.Next()){ + std::string wav_key = reader.Key(); + const WaveData &wave = reader.Value(); + BaseFloat samp_freq = wave.SampFreq(); // read sampling fequency + const Matrix &wave_data = wave.Data(); + int32 num_chan = wave_data.NumRows(), // number of channels in recording + num_ext_samp = (int32)(samp_freq * sil_len); // number of samples that will be extended + KALDI_ASSERT(num_ext_samp > 0); + Matrix new_wave(wave_data.NumRows(), wave_data.NumCols() + num_ext_samp); + for(int32 i = 0; i < num_chan; i++){ + Vector wav_this_chan(wave_data.Row(i)); + Vector wav_extend(wav_this_chan.Dim() + num_ext_samp); + ExtendWaveWithSilence(wav_this_chan, samp_freq, &wav_extend, + sil_search_len, sil_extract_len, sil_extract_shift); + KALDI_ASSERT(wav_extend.Dim() == wav_this_chan.Dim() + num_ext_samp); + new_wave.CopyRowFromVec(wav_extend, i); + } + WaveData wave_out(samp_freq, new_wave); + writer.Write(wav_key, wave_out); + num_success++; + } + KALDI_LOG << "Successfully extended " << num_success << " files."; + return 0; + } else { + std::string wav_rxfilename = po.GetArg(1); + std::string wav_wxfilename = po.GetArg(2); + bool binary = true; + Input ki(wav_rxfilename, &binary); + WaveHolder wh; + if (!wh.Read(ki.Stream())) { + KALDI_ERR << "Read failure from " + << PrintableRxfilename(wav_rxfilename); + } - SequentialTableReader reader(wav_rspecifier); - TableWriter writer(wav_wspecifier); - int32 num_success = 0; + const WaveData& wave = wh.Value(); - for(; !reader.Done(); reader.Next()){ - std::string wav_key = reader.Key(); - const WaveData &wave = reader.Value(); BaseFloat samp_freq = wave.SampFreq(); // read sampling fequency const Matrix &wave_data = wave.Data(); int32 num_chan = wave_data.NumRows(), // number of channels in recording @@ -101,11 +132,14 @@ int main(int argc, char *argv[]) { new_wave.CopyRowFromVec(wav_extend, i); } WaveData wave_out(samp_freq, new_wave); - writer.Write(wav_key, wave_out); - num_success++; + + Output ko(wav_wxfilename, binary, false); + if (!WaveHolder::Write(ko.Stream(), true, wave_out)) { + KALDI_ERR << "Write failure to " + << PrintableWxfilename(wav_wxfilename); + } + // we do not print any log messages here } - KALDI_LOG << "Successfully extended " << num_success << " files."; - return 0; } catch(const std::exception &e) { std::cerr << e.what(); return -1; @@ -142,7 +176,7 @@ void ExtendWaveWithSilence(const Vector &wav_in, wav_out->Dim() - wav_in.Dim() + window_size_half); for(int32 i = 0; i < window_size_half; i++) // windowing the first half window wav_ext(i) *= half_window(i); - + int32 tmp_offset = 0; for(; tmp_offset + window_size < wav_ext.Dim();) { wav_ext.Range(tmp_offset, window_size).AddVec(1.0, windowed_silence); From 339c435f7c6e68f1f4bd50d780b2dcef2a18a6f3 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 11 Oct 2017 18:26:29 -0400 Subject: [PATCH 072/174] Old tolerance approach --- 
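This patch adds a new binary, nnet3-chain-split-and-get-egs, which splits
non-compact phone lattices into fixed-size chunks and writes chain egs
directly.  A typical invocation (taken from the usage string in the new file;
"$feats" expands to the actual features):

  lattice-copy --write-compact=false ark:1.lat ark:- | \
    nnet3-chain-split-and-get-egs --left-context=25 --right-context=9 \
      --num-frames=20 dir/normalization.fst "$feats" dir/tree \
      dir/0.trans_mdl ark,s,cs:- ark:cegs.1.ark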
src/chainbin/nnet3-chain-split-and-get-egs.cc | 391 ++++++++++++++++++ 1 file changed, 391 insertions(+) create mode 100644 src/chainbin/nnet3-chain-split-and-get-egs.cc diff --git a/src/chainbin/nnet3-chain-split-and-get-egs.cc b/src/chainbin/nnet3-chain-split-and-get-egs.cc new file mode 100644 index 00000000000..800ef52861d --- /dev/null +++ b/src/chainbin/nnet3-chain-split-and-get-egs.cc @@ -0,0 +1,391 @@ +// chainbin/nnet3-chain-get-egs.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "hmm/posterior.h" +#include "chain/chain-supervision-splitter.h" +#include "lat/lattice-functions.h" +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-example-utils.h" + +namespace kaldi { +namespace nnet3 { + + +/** + This function does all the processing for one utterance, and outputs the + supervision objects to 'example_writer'. Note: if normalization_fst is the + empty FST (with no states), it skips the final stage of egs preparation and + you should do it later with nnet3-chain-normalize-egs. +*/ + +static bool ProcessFile(const chain::SupervisionOptions &sup_opts, + const fst::StdVectorFst &normalization_fst, + const GeneralMatrix &feats, + const MatrixBase *ivector_feats, + int32 ivector_period, + const TransitionModel &trans_model, + const chain::SupervisionLatticeSplitter &sup_lat_splitter, + const VectorBase *deriv_weights, + int32 supervision_length_tolerance, + const std::string &utt_id, + bool compress, + UtteranceSplitter *utt_splitter, + NnetChainExampleWriter *example_writer) { + + int32 num_input_frames = feats.NumRows(); + + std::vector state_times; + + int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; + int32 num_output_frames = sup_lat_splitter.NumFrames(); + + if (deriv_weights && (std::abs(deriv_weights->Dim() - num_output_frames) + > supervision_length_tolerance)) { + KALDI_WARN << "For utterance " << utt_id + << ", mismatch between deriv-weights dim and num-output-frames" + << "; " << deriv_weights->Dim() << " vs " << num_output_frames; + return false; + } + + if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, num_output_frames, + supervision_length_tolerance)) + return false; // LengthsMatch() will have printed a warning. 
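  // A worked example of the frame bookkeeping done below, with hypothetical
  // numbers: if frame_subsampling_factor == 3 and the utterance splitter
  // produces a chunk with first_frame == 150 and num_frames == 150, then
  // start_frame_subsampled == 50 and num_frames_subsampled == 50, and that is
  // the (begin, size) range handed to GetFrameRangeSupervision().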
+ + std::vector chunks; + + utt_splitter->GetChunksForUtterance(num_input_frames, &chunks); + + if (chunks.empty()) { + KALDI_WARN << "Not producing egs for utterance " << utt_id + << " because it is too short: " + << num_input_frames << " frames."; + return false; + } + + for (size_t c = 0; c < chunks.size(); c++) { + ChunkTimeInfo &chunk = chunks[c]; + + int32 start_frame_subsampled = chunk.first_frame / frame_subsampling_factor, + num_frames_subsampled = chunk.num_frames / frame_subsampling_factor; + + + chain::Supervision supervision_part; + sup_lat_splitter.GetFrameRangeSupervision(start_frame_subsampled, + num_frames_subsampled, + &supervision_part); + + if (normalization_fst.NumStates() > 0 && + !chain::AddWeightToSupervisionFst(normalization_fst, + &supervision_part)) { + KALDI_WARN << "For utterance " << utt_id << ", feature frames " + << chunk.first_frame << " to " + << (chunk.first_frame + chunk.num_frames) + << ", FST was empty after composing with normalization FST. " + << "This should be extremely rare (a few per corpus, at most)"; + return false; + } + + int32 first_frame = 0; // we shift the time-indexes of all these parts so + // that the supervised part starts from frame 0. + + NnetChainExample nnet_chain_eg; + nnet_chain_eg.outputs.resize(1); + + SubVector output_weights( + &(chunk.output_weights[0]), + static_cast(chunk.output_weights.size())); + + if (!deriv_weights) { + NnetChainSupervision nnet_supervision("output", supervision_part, + output_weights, + first_frame, + frame_subsampling_factor); + nnet_chain_eg.outputs[0].Swap(&nnet_supervision); + } else { + Vector this_deriv_weights(num_frames_subsampled); + for (int32 i = 0; i < num_frames_subsampled; i++) { + int32 t = i + start_frame_subsampled; + if (t < deriv_weights->Dim()) + this_deriv_weights(i) = (*deriv_weights)(t); + } + KALDI_ASSERT(output_weights.Dim() == num_frames_subsampled); + this_deriv_weights.MulElements(output_weights); + NnetChainSupervision nnet_supervision("output", supervision_part, + this_deriv_weights, + first_frame, + frame_subsampling_factor); + nnet_chain_eg.outputs[0].Swap(&nnet_supervision); + } + + nnet_chain_eg.inputs.resize(ivector_feats != NULL ? 2 : 1); + + int32 tot_input_frames = chunk.left_context + chunk.num_frames + + chunk.right_context, + start_frame = chunk.first_frame - chunk.left_context; + + GeneralMatrix input_frames; + ExtractRowRangeWithPadding(feats, start_frame, tot_input_frames, + &input_frames); + + NnetIo input_io("input", -chunk.left_context, input_frames); + nnet_chain_eg.inputs[0].Swap(&input_io); + + if (ivector_feats != NULL) { + // if applicable, add the iVector feature. 
+ // choose iVector from a random frame in the chunk + int32 ivector_frame = RandInt(start_frame, + start_frame + num_input_frames - 1), + ivector_frame_subsampled = ivector_frame / ivector_period; + if (ivector_frame_subsampled < 0) + ivector_frame_subsampled = 0; + if (ivector_frame_subsampled >= ivector_feats->NumRows()) + ivector_frame_subsampled = ivector_feats->NumRows() - 1; + Matrix ivector(1, ivector_feats->NumCols()); + ivector.Row(0).CopyFromVec(ivector_feats->Row(ivector_frame_subsampled)); + NnetIo ivector_io("ivector", 0, ivector); + nnet_chain_eg.inputs[1].Swap(&ivector_io); + } + + if (compress) + nnet_chain_eg.Compress(); + + std::ostringstream os; + os << utt_id << "-" << chunk.first_frame; + + std::string key = os.str(); // key is - + + example_writer->Write(key, nnet_chain_eg); + } + return true; +} + +} // namespace nnet3 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Get frame-by-frame examples of data for nnet3+chain neural network\n" + "training. This involves breaking up utterances into pieces of a\n" + "fixed size. Input will come from non-compact phone lattice.\n" + "Note: if is not supplied the egs will not be\n" + "ready for training; in that case they should later be processed\n" + "with nnet3-chain-normalize-egs\n" + "\n" + "Usage: nnet3-chain-split-and-get-egs [options] [] " + " \n" + "\n" + "An example [where $feats expands to the actual features]:\n" + "lattice-copy --write-compact=false ark:1.lat ark:- | \\ \n" + " nnet3-chain-split-and-get-egs --left-context=25 --right-context=9 --num-frames=20 dir/normalization.fst \\\n" + " \"$feats\" dir/tree dir/0.trans_mdl ark,s,cs:- ark:cegs.1.ark\n"; + + bool compress = true; + int32 length_tolerance = 100, online_ivector_period = 1, + supervision_length_tolerance = 1; + + ExampleGenerationConfig eg_config; // controls num-frames, + // left/right-context, etc. + chain::SupervisionOptions sup_opts; + + int32 srand_seed = 0; + std::string online_ivector_rspecifier, deriv_weights_rspecifier; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs with input features " + "in compressed format (recommended). Update: this is now " + "only relevant if the features being read are un-compressed; " + "if already compressed, we keep we same compressed format when " + "dumping-egs."); + po.Register("ivectors", &online_ivector_rspecifier, "Alias for " + "--online-ivectors option, for back compatibility"); + po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of " + "ivector features, as a matrix."); + po.Register("online-ivector-period", &online_ivector_period, "Number of " + "frames between iVectors in matrices supplied to the " + "--online-ivectors option"); + po.Register("srand", &srand_seed, "Seed for random number generator "); + po.Register("length-tolerance", &length_tolerance, "Tolerance for " + "difference in num-frames between feat and ivector matrices"); + po.Register("supervision-length-tolerance", &supervision_length_tolerance, "Tolerance for " + "difference in num-frames-subsampled between supervision and deriv weights"); + po.Register("deriv-weights-rspecifier", &deriv_weights_rspecifier, + "Per-frame weights (only binary - 0 or 1) that specifies " + "whether a frame's gradient must be backpropagated or not. 
" + "Not specifying this is equivalent to specifying a vector of " + "all 1s."); + + eg_config.Register(&po); + + ParseOptions supervision_opts("supervision", &po); + sup_opts.Register(&supervision_opts); + + chain::SupervisionLatticeSplitterOptions sup_lat_splitter_opts; + sup_lat_splitter_opts.Register(&po); + + po.Read(argc, argv); + + srand(srand_seed); + + if (po.NumArgs() < 5 || po.NumArgs() > 6) { + po.PrintUsage(); + exit(1); + } + + std::string + normalization_fst_rxfilename, + feature_rspecifier, + tree_rxfilename, trans_model_rxfilename, + lattice_rspecifier, + examples_wspecifier; + if (po.NumArgs() == 5) { + feature_rspecifier = po.GetArg(1); + tree_rxfilename = po.GetArg(2); + trans_model_rxfilename = po.GetArg(3); + lattice_rspecifier = po.GetArg(4); + examples_wspecifier = po.GetArg(5); + } else { + normalization_fst_rxfilename = po.GetArg(1); + KALDI_ASSERT(!normalization_fst_rxfilename.empty()); + feature_rspecifier = po.GetArg(2); + tree_rxfilename = po.GetArg(3); + trans_model_rxfilename = po.GetArg(4); + lattice_rspecifier = po.GetArg(5); + examples_wspecifier = po.GetArg(6); + } + + eg_config.ComputeDerived(); + UtteranceSplitter utt_splitter(eg_config); + + fst::StdVectorFst normalization_fst; + if (!normalization_fst_rxfilename.empty()) { + ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); + KALDI_ASSERT(normalization_fst.NumStates() > 0); + + if (sup_opts.lm_scale < 0.0 || sup_opts.lm_scale > 1.0) { + KALDI_ERR << "Invalid lm-scale; must be in [0.0, 1.0)"; + } + + if (sup_opts.lm_scale != 0.0) { + ScaleFst(1.0 - sup_opts.lm_scale, &normalization_fst); + } + } + + // Read as GeneralMatrix so we don't need to un-compress and re-compress + // when selecting parts of matrices. + SequentialGeneralMatrixReader feat_reader(feature_rspecifier); + + TransitionModel trans_model; + ReadKaldiObject(trans_model_rxfilename, &trans_model); + + ContextDependency ctx_dep; + ReadKaldiObject(tree_rxfilename, &ctx_dep); + + RandomAccessLatticeReader lattice_reader( + lattice_rspecifier); + NnetChainExampleWriter example_writer(examples_wspecifier); + RandomAccessBaseFloatMatrixReader online_ivector_reader( + online_ivector_rspecifier); + RandomAccessBaseFloatVectorReader deriv_weights_reader( + deriv_weights_rspecifier); + + int32 num_err = 0; + + KALDI_ASSERT(sup_lat_splitter_opts.add_tolerance_to_lat && + sup_opts.frame_subsampling_factor == 1); + + chain::SupervisionLatticeSplitter sup_lat_splitter( + sup_lat_splitter_opts, sup_opts, trans_model); + + for (; !feat_reader.Done(); feat_reader.Next()) { + std::string key = feat_reader.Key(); + const GeneralMatrix &feats = feat_reader.Value(); + if (!lattice_reader.HasKey(key)) { + KALDI_WARN << "No lattice for key " << key; + num_err++; + } else { + const Lattice &lat = lattice_reader.Value(key); + const Matrix *online_ivector_feats = NULL; + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(key)) { + KALDI_WARN << "No iVectors for utterance " << key; + num_err++; + continue; + } else { + // this address will be valid until we call HasKey() or Value() + // again. 
+ online_ivector_feats = &(online_ivector_reader.Value(key)); + } + } + if (online_ivector_feats != NULL && + (abs(feats.NumRows() - (online_ivector_feats->NumRows() * + online_ivector_period)) > length_tolerance + || online_ivector_feats->NumRows() == 0)) { + KALDI_WARN << "Length difference between feats " << feats.NumRows() + << " and iVectors " << online_ivector_feats->NumRows() + << "exceeds tolerance " << length_tolerance; + num_err++; + continue; + } + + const Vector *deriv_weights = NULL; + if (!deriv_weights_rspecifier.empty()) { + if (!deriv_weights_reader.HasKey(key)) { + KALDI_WARN << "No deriv weights for utterance " << key; + num_err++; + continue; + } else { + // this address will be valid until we call HasKey() or Value() + // again. + deriv_weights = &(deriv_weights_reader.Value(key)); + } + } + + sup_lat_splitter.LoadLattice(lat); + + if (!ProcessFile(sup_opts, normalization_fst, feats, + online_ivector_feats, online_ivector_period, + trans_model, sup_lat_splitter, + deriv_weights, supervision_length_tolerance, + key, compress, + &utt_splitter, &example_writer)) + num_err++; + } + } + if (num_err > 0) + KALDI_WARN << num_err << " utterances had errors and could " + "not be processed."; + // utt_splitter prints stats in its destructor. + return utt_splitter.ExitStatus(); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} From e90ca235e819a40aa37e6cd8c19672c179ae7d44 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 16 Oct 2017 18:33:00 -0400 Subject: [PATCH 073/174] semisup: adding mbr supervision --- .../run_tdnn_15k_semisupervised_conf_q.sh | 2 +- .../run_tdnn_15k_semisupervised_conf_s.sh | 8 +- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 62 +++-- egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh | 14 +- .../steps/nnet3/multilingual/combine_egs.sh | 3 +- .../data/perturb_speed_to_allowed_lengths.py | 90 ++++--- src/chain/chain-supervision-splitter-test.cc | 90 +++++-- src/chain/chain-supervision-splitter.cc | 231 +++++++++--------- src/chain/chain-supervision-splitter.h | 2 + src/chain/chain-supervision.cc | 161 +++++++++++- src/chain/chain-supervision.h | 16 +- src/chainbin/nnet3-chain-normalize-egs.cc | 4 +- 12 files changed, 463 insertions(+), 220 deletions(-) diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_q.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_q.sh index 3c452cbde83..d587196ce9f 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_q.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_q.sh @@ -48,7 +48,7 @@ extra_right_context=0 xent_regularize=0.1 hidden_dim=725 -minibatch_size="150=128,64/300=100,64,32/600=50,32,16/1200=16,8" +minibatch_size="150=128,64/300=64,32/600=32,16,8/1200=16,8,4" # to tune: # frames_per_eg for unsupervised diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh index ab78062f89d..5c991e7770c 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh @@ -24,7 +24,7 @@ tdnn_affix=7b # affix for the supervised chain-model directory train_supervised_opts="--stage -10 --train-stage -10" # Unsupervised options -decode_affix= +decode_affix=_unphdet egs_affix= # affix for 
the egs that are generated from unsupervised data and for the comined egs dir unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices @@ -165,6 +165,12 @@ fi dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} if [ $stage -le 10 ]; then + steps/subset_ali_dir.sh --cmd "$train_cmd" \ + data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ + $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} + echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ $dir diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 749059f3475..196c1f9d22b 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -78,6 +78,10 @@ phone_insertion_penalty= deriv_weights_scp= generate_egs_scp=false no_chunking=false +use_mbr_decode=false +arc_scale=1.0 +keep_only_best_path=false +lat_copy_src= echo "$0 $@" # Print the command line for logging @@ -293,15 +297,23 @@ fi if [ $stage -le 2 ]; then echo "$0: copying training lattices" - [ ! -z $lattice_prune_beam ] && \ - prune_cmd="ark:- | lattice-prune --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:-" - $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ - lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" $prune_cmd ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; + if [ -z "$lat_copy_src" ]; then + if ! $keep_only_best_path; then + $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ + lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; + else + $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ + lattice-interp --alpha=1 "ark:gunzip -c $latdir/lat.JOB.gz|" \ + "ark:gunzip -c $latdir/lat.JOB.gz | lattice-1best --acoustic-scale=$acwt ark:- ark:- |" \ + ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; + fi - for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp + for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp + else + ln -s `readlink -f $lat_copy_src`/lat.*.{ark,scp} $dir/ + fi fi - egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" [ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" [ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" @@ -317,14 +329,30 @@ chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$ali chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance" normalization_scale=1.0 -if [ ! 
-z "$lattice_lm_scale" ]; then - chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" - normalization_scale=$(perl -e " - if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) { - print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; - exit(1); - } - print (1.0 - $lattice_lm_scale);") + +lattice_copy_cmd="ark:-" +if [ ! -z $lattice_prune_beam ]; then + if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then + lattice_copy_cmd="ark:- | lattice-1best --acoustic-scale=$acwt ark:- ark:-" + else + lattice_copy_cmd="ark:- | lattice-prune --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:-" + fi +fi + +if ! $use_mbr_decode; then + if [ ! -z "$lattice_lm_scale" ]; then + chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" + + normalization_scale=$(perl -e " + if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) { + print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; + exit(1); + } + print (1.0 - $lattice_lm_scale);") + fi +else + chain_supervision_all_opts="$chain_supervision_all_opts --arc-scale=$arc_scale --use-mbr-decode" + lattice_copy_cmd="$lattice_copy_cmd | lattice-scale --acoustic-scale=$acwt ark:- ark:-" fi [ ! -z $phone_insertion_penalty ] && \ @@ -356,7 +384,7 @@ if [ $stage -le 3 ]; then $cmd $dir/log/create_valid_subset.log \ utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ - lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- $lattice_copy_cmd \| \ chain-get-supervision $chain_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=$srand \ @@ -364,7 +392,7 @@ if [ $stage -le 3 ]; then "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || touch $dir/.error & $cmd $dir/log/create_train_subset.log \ utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ - lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- $lattice_copy_cmd \| \ chain-get-supervision $chain_supervision_all_opts \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=$srand \ @@ -429,7 +457,7 @@ if [ $stage -le 4 ]; then # quite large. $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp \| \ - lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- $lattice_copy_cmd \| \ chain-get-supervision $chain_supervision_all_opts \ --weight=$egs_weight \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh index ada04aef31c..89211621fa9 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh @@ -276,15 +276,12 @@ fi if [ $stage -le 2 ]; then echo "$0: copying training lattices" - [ ! 
-z $lattice_prune_beam ] && \ - prune_cmd="ark:- | lattice-prune --write-compact=false --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:-" $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ - lattice-copy --write-compact=false "ark:gunzip -c $latdir/lat.JOB.gz|" $prune_cmd ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; + lattice-copy --write-compact=false "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp fi - egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" [ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" [ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" @@ -302,7 +299,7 @@ normalization_scale=1.0 if [ ! -z "$lattice_lm_scale" ]; then chain_supervision_all_opts="$chain_supervision_all_opts --supervision.lm-scale=$lattice_lm_scale" normalization_scale=$(perl -e " - if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) { + if ($lattice_lm_scale > 1.0 || $lattice_lm_scale < 0) { print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; exit(1); } @@ -328,6 +325,11 @@ echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context echo $left_context_initial > $dir/info/left_context_initial echo $right_context_final > $dir/info/right_context_final + +lattice_copy_cmd="ark:-" + +[ ! -z $lattice_prune_beam ] && \ + lattice_copy_cmd="ark:- | lattice-prune --write-compact=false --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:-" if [ $stage -le 3 ]; then echo "$0: Getting validation and training subset examples." 
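
The perl snippets in the two get_egs scripts above only compute the scale that is later applied to the normalization FST: 1 minus the lattice LM scale, after a range check. A small Python sketch of that logic (the helper name is invented for illustration and is not part of either script):

def get_normalization_scale(lattice_lm_scale):
    # get_egs.sh rejects values >= 1.0; the get_egs_split.sh hunk above
    # relaxes the check to > 1.0, so a scale of exactly 1.0 is accepted there.
    if lattice_lm_scale < 0.0 or lattice_lm_scale > 1.0:
        raise ValueError("Invalid --lattice-lm-scale %s" % lattice_lm_scale)
    return 1.0 - lattice_lm_scale

print(get_normalization_scale(0.5))  # 0.5

The intent is that the LM weight carried by the lattice (the lm-scale) and the weight given to the normalization FST always sum to 1.
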
@@ -340,7 +342,7 @@ if [ $stage -le 3 ]; then $cmd $dir/log/create_valid_subset.log \ utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ - lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- $lattice_copy_cmd \| \ nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=$srand \ $egs_opts $chaindir/normalization.fst \ "$valid_feats" $chaindir/tree $chaindir/0.trans_mdl \ diff --git a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh index 2fc6357aa46..f899a50e58f 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh +++ b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh @@ -86,7 +86,8 @@ for lang in $(seq 0 $[$num_langs-1]);do valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.scp" combine_scp_list="$combine_scp_list ${args[$lang]}/combine.scp" - this_frames_per_eg=$(cat ${args[$lang]}/info/frames_per_eg | cut -d, -f 1) # use only the primary frames-per-eg + this_frames_per_eg=$(cat ${args[$lang]}/info/frames_per_eg | \ + awk -F, '{for (i=1; i<=NF; i++) sum += $i;} END{print int(sum / NF)}') # use average frames-per-eg if [ $lang -eq 0 ]; then frames_per_eg_list="$this_frames_per_eg" diff --git a/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py b/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py index 1cd90e29517..ee2c5def33d 100755 --- a/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py +++ b/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py @@ -24,6 +24,7 @@ help="""Percentage of durations not covered from each side of duration histogram.""") parser.add_argument('--no-speed-perturb', action='store_true') +parser.add_argument("--only-speed-perturb", action='store_true') args = parser.parse_args() @@ -189,55 +190,46 @@ def generate_kaldi_data_files(utterances, outdir): prev_d = d # i determines the closest allowed durs - if i > 0: - allowed_dur = durs[i - 1] # this is smaller than u.dur - speed = u.dur / allowed_dur - if max(speed, 1.0/speed) > factor: - #print('rejected: {} --> dur was {} speed was {}'.format(u.id, u.dur, speed)) - continue - u1 = copy.deepcopy(u) - u1.id = 'pv1-' + u.id - u1.speaker = 'pv1-' + u.speaker - u1.wavefile = '{} sox -t wav - -t wav - speed {} | '.format(u.wavefile, speed) - u1.dur = allowed_dur - if not args.no_speed_perturb: - perturbed_utts += [u1] - - - if i < len(durs) - 1: - allowed_dur2 = durs[i] # this is bigger than u.dur - speed = u.dur / allowed_dur2 - if max(speed, 1.0/speed) > factor: - #print('no v2/v3 for: {} --> dur was {} speed was {}'.format(u.id, u.dur, speed)) - continue - - ## Add two versions for the second allowed_length - ## one version is by using speed modification using sox - ## the other is by extending by silence - u2 = copy.deepcopy(u) - u2.id = 'pv2-' + u.id - u2.speaker = 'pv2-' + u.speaker - u2.wavefile = '{} sox -t wav - -t wav - speed {} | '.format(u.wavefile, speed) - u2.dur = allowed_dur2 - if not args.no_speed_perturb: - perturbed_utts += [u2] - - delta = allowed_dur2 - u.dur - if delta <= 1e-4: - continue - u3 = copy.deepcopy(u) - prefix = 'pv3-' if not args.no_speed_perturb else '' - u3.id = prefix + u.id - u3.speaker = prefix + u.speaker - - parts = u.wavefile.split() - if len(parts) == 1: - u3.wavefile = 'extend-wav-with-silence --extra-silence-length={1} {0} - | '.format(u.wavefile, delta) - else: - 
assert parts[-1] == "|" - u3.wavefile = '{0} extend-wav-with-silence --extra-silence-length={1} - - | '.format(u.wavefile, delta) - u3.dur = allowed_dur2 - perturbed_utts += [u3] + allowed_dur = durs[i - 1] if i > 0 else durs[i] + speed = u.dur / allowed_dur + if max(speed, 1.0/speed) > factor: + #print('rejected: {} --> dur was {} speed was {}'.format(u.id, u.dur, speed)) + continue + u1 = copy.deepcopy(u) + prefix = 'pv1' if not args.only_speed_perturb else '' + u1.id = prefix + u.id + u1.speaker = prefix + u.speaker + parts = u.wavefile.split() + if len(parts) == 1: + u1.wavefile = 'wav-copy {0} - | sox -t wav - -t wav - speed {1} | '.format( + u.wavefile, speed) + else: + assert parts[-1] == "|" + u1.wavefile = '{0} sox -t wav - -t wav - speed {1} | '.format( + u.wavefile, speed) + u1.dur = allowed_dur + if not args.no_speed_perturb: + perturbed_utts += [u1] + + if args.only_speed_perturb: + continue + + delta = allowed_dur - u.dur + if delta <= 1e-4: + continue + u3 = copy.deepcopy(u) + prefix = 'pv3-' if not args.no_speed_perturb else '' + u3.id = prefix + u.id + u3.speaker = prefix + u.speaker + + parts = u.wavefile.split() + if len(parts) == 1: + u3.wavefile = 'extend-wav-with-silence --extra-silence-length={1} {0} - | '.format(u.wavefile, delta) + else: + assert parts[-1] == "|" + u3.wavefile = '{0} extend-wav-with-silence --extra-silence-length={1} - - | '.format(u.wavefile, delta) + u3.dur = allowed_dur2 + perturbed_utts += [u3] # 3. write to our dir generate_kaldi_data_files(perturbed_utts, args.dir) diff --git a/src/chain/chain-supervision-splitter-test.cc b/src/chain/chain-supervision-splitter-test.cc index c62edafed8a..9314da13682 100644 --- a/src/chain/chain-supervision-splitter-test.cc +++ b/src/chain/chain-supervision-splitter-test.cc @@ -65,7 +65,6 @@ void FstToLabels(const fst::StdVectorFst &fst, void TestSupervisionLatticeSplitting( const SupervisionOptions &sup_opts, - const fst::StdVectorFst &tolerance_fst, const TransitionModel &trans_model, Lattice &lat) { @@ -73,7 +72,8 @@ void TestSupervisionLatticeSplitting( chain::SupervisionLatticeSplitterOptions opts; chain::SupervisionLatticeSplitter sup_lat_splitter( - opts, trans_model, lat); + opts, sup_opts, trans_model); + sup_lat_splitter.LoadLattice(lat); std::vector state_times; int32 num_frames_lat = LatticeStateTimes(lat, &state_times); @@ -100,17 +100,10 @@ void TestSupervisionLatticeSplitting( num_frames = num_frames_lat - start_frame; } - Lattice lat_part; - sup_lat_splitter.GetFrameRange(start_frame, num_frames, &lat_part); - - ScaleLattice(fst::LatticeScale(1.0, 0.0), &lat_part); - chain::Supervision supervision_part; - - chain::PhoneLatticeToSupervision(tolerance_fst, - trans_model, lat_part, - &supervision_part); - + sup_lat_splitter.GetFrameRangeSupervision( + start_frame, num_frames, &supervision_part); + std::vector > labels; FstToLabels(supervision_part.fst, &labels); @@ -162,9 +155,52 @@ void TestSupervisionLatticeSplitting( } } +TransitionModel* GetSimpleChainTransitionModel( + ContextDependency **ctx_dep, int32 num_phones) { + + std::ostringstream oss; + + oss << "\n" + "\n" + " "; + for (int32 i = 1; i <= num_phones; i++) { + oss << i << " "; + } + oss << "\n" + " 0 0 1\n" + " 0 0.5\n" + " 1 0.5\n" + " \n" + " 1 \n" + "\n" + "\n"; + + std::string chain_input_str = oss.str(); + + HmmTopology topo; + std::istringstream iss(chain_input_str); + topo.Read(iss, false); + + const std::vector &phones = topo.GetPhones(); + + std::vector phone2num_pdf_classes (1+phones.back()); + for (size_t i = 0; i < 
phones.size(); i++) + phone2num_pdf_classes[phones[i]] = topo.NumPdfClasses(phones[i]); + + *ctx_dep = MonophoneContextDependency(phones, phone2num_pdf_classes); + + return new TransitionModel(**ctx_dep, topo); +} + void ChainSupervisionSplitterTest(int32 index) { ContextDependency *ctx_dep; - TransitionModel *trans_model = GenRandTransitionModel(&ctx_dep, 2); + TransitionModel *trans_model; + + if (Rand()) + trans_model = GenRandTransitionModel(&ctx_dep, 2); + else + trans_model = GetSimpleChainTransitionModel(&ctx_dep, 2); + const std::vector &phones = trans_model->GetPhones(); int32 subsample_factor = 1; @@ -204,13 +240,31 @@ void ChainSupervisionSplitterTest(int32 index) { sup_opts.lm_scale = 0.5; fst::StdVectorFst tolerance_fst; - MakeToleranceEnforcerFst(sup_opts, *trans_model, &tolerance_fst); + GetToleranceEnforcerFst(sup_opts, *trans_model, &tolerance_fst); WriteFstKaldi(std::cerr, false, tolerance_fst); - fst::ArcSort(&tolerance_fst, fst::ILabelCompare()); + TestSupervisionLatticeSplitting(sup_opts, *trans_model, lat); + + delete ctx_dep; + delete trans_model; +} - TestSupervisionLatticeSplitting(sup_opts, tolerance_fst, *trans_model, lat); +void TestToleranceFst() { + ContextDependency *ctx_dep; + TransitionModel *trans_model = GetSimpleChainTransitionModel(&ctx_dep, 2); + + chain::SupervisionOptions sup_opts; + sup_opts.left_tolerance = 1; + sup_opts.right_tolerance = 1; + sup_opts.frame_subsampling_factor = 1; + sup_opts.lm_scale = 0.5; + fst::StdVectorFst tolerance_fst; + GetToleranceEnforcerFst(sup_opts, *trans_model, &tolerance_fst); + WriteFstKaldi(std::cerr, false, tolerance_fst); + + fst::ArcSort(&tolerance_fst, fst::ILabelCompare()); + delete ctx_dep; delete trans_model; } @@ -221,6 +275,10 @@ void ChainSupervisionSplitterTest(int32 index) { int main() { using namespace kaldi; SetVerboseLevel(2); + + kaldi::chain::TestToleranceFst(); + return 0; + for (int32 i = 0; i < 10; i++) { kaldi::chain::ChainSupervisionSplitterTest(i); } diff --git a/src/chain/chain-supervision-splitter.cc b/src/chain/chain-supervision-splitter.cc index 2fd73704a6c..edff10a4648 100644 --- a/src/chain/chain-supervision-splitter.cc +++ b/src/chain/chain-supervision-splitter.cc @@ -437,6 +437,7 @@ bool SupervisionLatticeSplitter::GetSupervision( TableCompose(transition_id_fst, tolerance_fst_, &(supervision->fst), compose_opts); + } else { std::swap(transition_id_fst, supervision->fst); } @@ -449,6 +450,11 @@ bool SupervisionLatticeSplitter::GetSupervision( fst::RmEpsilon(&(supervision->fst)); fst::DeterminizeInLog(&(supervision->fst)); + + if (opts_.debug) { + std::cerr << "tolerance added fst"; + fst::WriteFstKaldi(std::cerr, false, supervision->fst); + } KALDI_ASSERT(supervision->fst.Properties(fst::kIEpsilons, true) == 0); if (supervision->fst.NumStates() == 0) { @@ -492,10 +498,22 @@ class ToleranceEnforcerFstCreator { typedef fst::StdArc::StateId StateId; typedef fst::StdArc::Label Label; - void AddSelfLoops(int32 offset); - void AddArcToTempStates(int32 offset); - void InsertSelfLoopTransitions(int32 offset); - void DeleteSelfLoopTransitions(int32 offset); + enum StateType { + kInit, + kDeletion, + kAccept, + kInsertion + }; + + inline int32 GetStateId(int32 offset, int32 forward_id, int32 type) { + return ((offset + zero_offset_index_) * (num_forward_transitions_ * 3 + 1) + + (type == kInit ? 
0 : 1 + (type - 1) * num_forward_transitions_ + + forward_id) + 1); + } + + void AddArcsForOffset(int32 offset); + void AddArcsForForwardTransition(int32 offset, int32 forward_id, int32 trans_id); + void AddArcsBetweenOffsets(int32 offset, int32 forward_id, int32 trans_id); const SupervisionOptions &opts_; const TransitionModel &trans_model_; @@ -529,154 +547,127 @@ ToleranceEnforcerFstCreator::ToleranceEnforcerFstCreator( fst_->DeleteStates(); } -void ToleranceEnforcerFstCreator::AddSelfLoops(int32 offset) { - StateId state = (offset + zero_offset_index_) * (num_forward_transitions_ + 1); - for (int32 trans_id = 1; trans_id <= trans_model_.NumTransitionIds(); - trans_id++) { - int32 pdf_id = trans_model_.TransitionIdToPdf(trans_id); - fst_->AddArc(state, - fst::StdArc(trans_id, pdf_id + 1, - fst::TropicalWeight::One(), state)); - } -} +void ToleranceEnforcerFstCreator::AddArcsForForwardTransition( + int32 offset, int32 forward_id, int32 trans_id) { + StateId init_state = GetStateId(offset, forward_id, kInit); -/* This function adds arcs from each "offset" state to a temporary state - * emitting a forward-pdf. These temporary states have arcs to states - * "offset+1" and "offset-1" (other than the boundaries). These arcs will - * be added later by the function DeleteSelfLoopTransitions and - * InsertSelfLoopTransitions. - */ -void ToleranceEnforcerFstCreator::AddArcToTempStates(int32 offset) { - StateId state = (offset + zero_offset_index_) * (num_forward_transitions_ + 1); - KALDI_ASSERT(state < fst_->NumStates()); - - int32 forward_idx = 1; - for (Label trans_id = 1; - trans_id <= trans_model_.NumTransitionIds(); - trans_id++) { - if (!trans_model_.IsSelfLoop(trans_id)) { - // Add a temporary state for each non-self loop transition - KALDI_ASSERT(forward_idx <= num_forward_transitions_); - StateId next_state = state + forward_idx; - KALDI_ASSERT(next_state < fst_->NumStates()); - int32 pdf_id = trans_model_.TransitionIdToPdf(trans_id); - - fst_->AddArc(state, - fst::StdArc(trans_id, pdf_id + 1, - fst::TropicalWeight::One(), next_state)); - forward_idx++; - } - } -} + if (offset == 0 && forward_id == 0) + fst_->SetFinal(init_state, fst::TropicalWeight::One()); -/* This function adds arcs out of temporary states corresponding to each offset - * offset that will delete self-loop transition-ids. Doing so will result in - * moving to the state corresponding to offset one lower. - */ -void ToleranceEnforcerFstCreator::DeleteSelfLoopTransitions(int32 offset) { - KALDI_ASSERT(offset >= -opts_.left_tolerance && offset <= opts_.right_tolerance); + // We expect this is to be a forward transition + KALDI_ASSERT(!trans_model_.IsSelfLoop(trans_id)); + int32 forward_pdf = trans_model_.TransitionIdToPdf(trans_id); + int32 tstate = trans_model_.TransitionIdToTransitionState(trans_id); + int32 self_loop_tid = trans_model_.SelfLoopOf(tstate); + int32 self_loop_pdf = trans_model_.TransitionIdToPdf(self_loop_tid); - // If offset is at the left-tolerance, we cannot decrease it further. 
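
The GetStateId() scheme above packs the tolerance FST's states into one block per offset: state 0 is reserved as the start state, and each offset owns one init state plus a (deletion, accept, insertion) state for every forward (non-self-loop) transition. A quick Python sanity check of that numbering (illustrative only; the tolerances and transition count are made-up example values, and zero_offset_index is assumed to equal the left tolerance):

K_INIT, K_DELETION, K_ACCEPT, K_INSERTION = 0, 1, 2, 3

def get_state_id(offset, forward_id, state_type,
                 zero_offset_index, num_forward_transitions):
    block = num_forward_transitions * 3 + 1      # states owned by one offset
    if state_type == K_INIT:
        within = 0
    else:
        within = 1 + (state_type - 1) * num_forward_transitions + forward_id
    return (offset + zero_offset_index) * block + within + 1   # 0 is the start state

left_tol, right_tol, num_fwd = 2, 2, 4
ids = set()
for off in range(-left_tol, right_tol + 1):
    ids.add(get_state_id(off, 0, K_INIT, left_tol, num_fwd))
    for t in (K_DELETION, K_ACCEPT, K_INSERTION):
        for f in range(num_fwd):
            ids.add(get_state_id(off, f, t, left_tol, num_fwd))

num_offsets = left_tol + right_tol + 1
# The ids cover exactly 1 .. num_offsets*(3*num_fwd+1), matching the number
# of states reserved in MakeFst() below (which adds one extra start state).
assert ids == set(range(1, num_offsets * (3 * num_fwd + 1) + 1))
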
- if (offset == -opts_.left_tolerance) return; - int32 next_offset = offset - 1; + for (int32 i = 1; i <= 3; i++) { + StateId next_state = GetStateId(offset, forward_id, i); - StateId state = (offset + zero_offset_index_) * (num_forward_transitions_ + 1); - StateId next_offset_state = (next_offset + zero_offset_index_) - * (num_forward_transitions_ + 1); + // accept a forward transition from initial state + fst_->AddArc(init_state, + fst::StdArc(trans_id, forward_pdf + 1, + fst::TropicalWeight::One(), + next_state)); + + // epsilon-arc to initial state + fst_->AddArc(next_state, + fst::StdArc(0, 0, + fst::TropicalWeight::One(), + init_state)); + + // self-loop + fst_->AddArc(next_state, + fst::StdArc(self_loop_tid, self_loop_pdf + 1, + fst::TropicalWeight::One(), + next_state)); + } +} - KALDI_ASSERT(state < fst_->NumStates() && next_offset_state < fst_->NumStates()); +void ToleranceEnforcerFstCreator::AddArcsBetweenOffsets( + int32 offset, int32 forward_id, int32 trans_id) { + // We expect this is to be a forward transition + KALDI_ASSERT(!trans_model_.IsSelfLoop(trans_id)); + int32 tstate = trans_model_.TransitionIdToTransitionState(trans_id); + int32 self_loop_tid = trans_model_.SelfLoopOf(tstate); + int32 self_loop_pdf = trans_model_.TransitionIdToPdf(self_loop_tid); - int32 forward_idx = 1; - for (Label trans_id = 1; - trans_id <= trans_model_.NumTransitionIds(); - trans_id++) { - if (!trans_model_.IsSelfLoop(trans_id)) { - KALDI_ASSERT(forward_idx <= num_forward_transitions_); - StateId next_state = state + forward_idx; - KALDI_ASSERT(next_state < fst_->NumStates()); - // We already added an arc to this next_state in the function - // AddArcToTempStates. Now we only need to delete a self-loop - // transition, which can be done by emitting an epsilon on the output. + if (offset > -opts_.left_tolerance) { + StateId state = GetStateId(offset, forward_id, kDeletion); + StateId next_state = GetStateId(offset - 1, forward_id, kDeletion); - int32 tstate = trans_model_.TransitionIdToTransitionState(trans_id); - Label self_loop_tid = trans_model_.SelfLoopOf(tstate); + fst_->AddArc(state, + fst::StdArc(self_loop_tid, 0, + fst::TropicalWeight::One(), + next_state)); + } - fst_->AddArc(next_state, - fst::StdArc(self_loop_tid, 0, - fst::TropicalWeight::One(), next_offset_state)); + if (offset < opts_.right_tolerance) { + StateId state = GetStateId(offset, forward_id, kInsertion); + StateId next_state = GetStateId(offset + 1, forward_id, kInsertion); - forward_idx++; + fst_->AddArc(state, + fst::StdArc(0, self_loop_pdf + 1, + fst::TropicalWeight::One(), + next_state)); + } + + if (offset == 0) { + if (forward_id == 0) { + StateId state = GetStateId(offset, forward_id, kInit); + fst_->AddArc(0, + fst::StdArc(0, 0, + fst::TropicalWeight::One(), + state)); + } + + for (int32 i = 1; i <= 3; i++) { + StateId next_state = GetStateId(offset, forward_id, i); + fst_->AddArc(0, + fst::StdArc(self_loop_tid, self_loop_pdf + 1, + fst::TropicalWeight::One(), + next_state)); } } } -/* This function adds arcs out of temporary states corresponding to each offset - * offset that will insert self-loop transition-ids. Doing so will result in - * moving to the state corresponding to offset one higher. - */ -void ToleranceEnforcerFstCreator::InsertSelfLoopTransitions(int32 offset) { - KALDI_ASSERT(offset >= -opts_.left_tolerance && offset <= opts_.right_tolerance); - - // If offset is at the right-tolerance, we cannot increase it further. 
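
Conceptually, the offset of a state records how many self-loop frames have been deleted from or inserted into the alignment so far: a deletion arc (self-loop transition-id in, epsilon out) exists only while the offset is still above -left_tolerance and moves it down by one, and an insertion arc (epsilon in, self-loop pdf out) exists only while it is below right_tolerance and moves it up by one. A toy model of that bookkeeping, not the FST construction itself:

def apply_edit(offset, edit, left_tolerance, right_tolerance):
    if edit == "delete_self_loop":
        # only possible while offset > -left_tolerance; otherwise no arc exists
        return offset - 1 if offset > -left_tolerance else None
    if edit == "insert_self_loop":
        return offset + 1 if offset < right_tolerance else None
    return offset  # plain acceptance of a transition leaves the offset unchanged

assert apply_edit(0, "delete_self_loop", 2, 2) == -1
assert apply_edit(-2, "delete_self_loop", 2, 2) is None
assert apply_edit(2, "insert_self_loop", 2, 2) is None
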
- if (offset == opts_.right_tolerance) return; - int32 next_offset = offset + 1; - - StateId state = (offset + zero_offset_index_) * (num_forward_transitions_ + 1); - StateId next_offset_state = (next_offset + zero_offset_index_) - * (num_forward_transitions_ + 1); - - KALDI_ASSERT(state < fst_->NumStates() && next_offset_state < fst_->NumStates()); - - int32 forward_idx = 1; - for (Label trans_id = 1; - trans_id <= trans_model_.NumTransitionIds(); +void ToleranceEnforcerFstCreator::AddArcsForOffset(int32 offset) { + int32 forward_id = 0; + for (int32 trans_id = 1; trans_id <= trans_model_.NumTransitionIds(); trans_id++) { if (!trans_model_.IsSelfLoop(trans_id)) { - KALDI_ASSERT(forward_idx <= num_forward_transitions_); - StateId next_state = state + forward_idx; - KALDI_ASSERT(next_state < fst_->NumStates()); - // We already added an arc to this next_state in the function - // AddArcToTempStates. Now we only need to insert a self-loop - // transition, which can be done by emitting an epsilon on the input - // side with the self-loop pdf on the output. - - int32 tstate = trans_model_.TransitionIdToTransitionState(trans_id); - int32 self_loop_pdf = trans_model_.TransitionStateToSelfLoopPdf(tstate); - - fst_->AddArc(next_state, - fst::StdArc(0, self_loop_pdf + 1, - fst::TropicalWeight::One(), next_offset_state)); - - forward_idx++; + AddArcsForForwardTransition(offset, forward_id, trans_id); + AddArcsBetweenOffsets(offset, forward_id, trans_id); + forward_id++; } } + } void ToleranceEnforcerFstCreator::MakeFst() { - int32 num_states = num_offsets_ * (num_forward_transitions_ + 1); + int32 num_states = num_offsets_ * (3 * num_forward_transitions_ + 1) + 1; fst_->ReserveStates(num_states); for (int32 s = 0; s < num_states; s++) fst_->AddState(); - StateId start_state = zero_offset_index_ * (num_forward_transitions_ + 1); - fst_->SetStart(start_state); - fst_->SetFinal(start_state, fst::TropicalWeight::One()); + fst_->SetStart(0); for (int32 o = -opts_.left_tolerance; o <= opts_.right_tolerance; o++) { - AddSelfLoops(o); - AddArcToTempStates(o); - DeleteSelfLoopTransitions(o); - InsertSelfLoopTransitions(o); + AddArcsForOffset(o); } - - KALDI_ASSERT(fst_->Start() == zero_offset_index_ * (num_forward_transitions_ + 1)); - + fst::ArcSort(fst_, fst::ILabelCompare()); } void SupervisionLatticeSplitter::MakeToleranceEnforcerFst() { - ToleranceEnforcerFstCreator creator(sup_opts_, trans_model_, &tolerance_fst_); + GetToleranceEnforcerFst(sup_opts_, trans_model_, &tolerance_fst_); +} + +void GetToleranceEnforcerFst(const SupervisionOptions &sup_opts, + const TransitionModel &trans_model, + fst::StdVectorFst *tolerance_fst) { + ToleranceEnforcerFstCreator creator(sup_opts, trans_model, tolerance_fst); creator.MakeFst(); } diff --git a/src/chain/chain-supervision-splitter.h b/src/chain/chain-supervision-splitter.h index 8b6b43caec5..bf73a2c8833 100644 --- a/src/chain/chain-supervision-splitter.h +++ b/src/chain/chain-supervision-splitter.h @@ -172,6 +172,8 @@ class SupervisionLatticeSplitter { // This will be computed when PrepareLattice function is called. 
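
A few hunks further down, chain-supervision.cc gains an MBR-based path (PhoneLatticeToProtoSupervisionInternalMbr) that turns the lattice's sausage statistics into a linear proto-supervision FST: one state per sausage position, and every phone whose posterior survives --min-prob becomes an arc weighted by arc_scale * (log prob - log max_prob) + phone_ins_penalty. A rough Python sketch of just that arc construction (not the Kaldi code; it ignores the allowed_phones bookkeeping and the later determinization):

import math

def sausage_to_arcs(sausage_stats, min_prob=0.01, arc_scale=1.0,
                    phone_ins_penalty=0.0):
    """sausage_stats: one list of (phone, posterior) pairs per position."""
    arcs = []   # (src_state, dst_state, phone, cost)
    for n, segment in enumerate(sausage_stats):
        max_prob = max(p for _, p in segment)
        for phone, prob in segment:
            if prob < min_prob:
                continue   # what --min-prob prunes away
            cost = arc_scale * (math.log(prob) - math.log(max_prob)) \
                   + phone_ins_penalty
            arcs.append((n, n + 1, phone, cost))
    return arcs

# The most probable phone at a position gets cost phone_ins_penalty;
# each alternative gets arc_scale * log(prob / max_prob) added to that.
print(sausage_to_arcs([[(3, 0.7), (5, 0.3)]]))
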
LatticeInfo lat_scores_; }; + +void GetToleranceEnforcerFst(const SupervisionOptions &opts, const TransitionModel &trans_model, fst::StdVectorFst *tolerance_fst); bool PhoneLatticeToSupervision(const fst::StdVectorFst &tolerance_fst, const TransitionModel &trans_model, diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 1a57522b1d1..42d4a57bbdf 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -20,6 +20,7 @@ #include "chain/chain-supervision.h" #include "lat/lattice-functions.h" #include "lat/push-lattice.h" +#include "lat/sausages.h" #include "util/text-utils.h" #include "hmm/hmm-utils.h" #include @@ -150,9 +151,17 @@ bool ProtoSupervision::operator == (const ProtoSupervision &other) const { fst::Equal(fst, other.fst)); } -bool PhoneLatticeToProtoSupervisionInternal(const SupervisionOptions &opts, - const CompactLattice &lat, - ProtoSupervision *proto_supervision) { +///void PushInLog(fst::VectorFst *fst) { +/// fst::VectorFst *fst_log = new VectorFst; +/// fst::Cast(*fst, fst_log); +/// fst::Push(fst_log); +/// fst::Cast(*fst_log, fst); +///} + +bool PhoneLatticeToProtoSupervisionInternalSimple( + const SupervisionOptions &opts, + const CompactLattice &lat, + ProtoSupervision *proto_supervision) { opts.Check(); ConstIntegerSet silence_set; @@ -235,17 +244,157 @@ bool PhoneLatticeToProtoSupervisionInternal(const SupervisionOptions &opts, KALDI_ASSERT(!proto_supervision->allowed_phones[t_subsampled].empty()); SortAndUniq(&(proto_supervision->allowed_phones[t_subsampled])); } + + return true; +} + +bool PhoneLatticeToProtoSupervisionInternalMbr( + const SupervisionOptions &opts, + const CompactLattice &lat, + ProtoSupervision *proto_supervision) { + opts.Check(); + + ConstIntegerSet silence_set; + if (!opts.silence_phones_str.empty()) { + std::vector silence_phones; + if (!SplitStringToIntegers(opts.silence_phones_str, ":,", false, + &silence_phones)) + KALDI_ERR << "Invalid silence-phones string " << opts.silence_phones_str; + silence_set.Init(silence_phones); + } + + if (lat.NumStates() == 0) { + KALDI_WARN << "Empty lattice provided"; + return false; + } + + MinimumBayesRisk mbr(lat); + const std::vector > > &sausage_stats + = mbr.GetSausageStats(); + + int32 num_states = lat.NumStates(); + proto_supervision->fst.DeleteStates(); + proto_supervision->fst.ReserveStates(sausage_stats.size() + 1); + std::vector state_times; + int32 num_frames = CompactLatticeStateTimes(lat, &state_times), + factor = opts.frame_subsampling_factor, + num_frames_subsampled = (num_frames + factor - 1) / factor; + + proto_supervision->allowed_phones.clear(); + proto_supervision->allowed_phones.resize(num_frames_subsampled); + + for (int32 state = 0; state < num_states; state++) { + int32 state_time = state_times[state]; + for (fst::ArcIterator aiter(lat, state); !aiter.Done(); + aiter.Next()) { + const CompactLatticeArc &lat_arc = aiter.Value(); + int32 next_state_time = state_time + lat_arc.weight.String().size(); + int32 phone = lat_arc.ilabel; // It's an acceptor so ilabel == ollabel. + if (phone == 0) { + KALDI_WARN << "CompactLattice has epsilon arc. 
Unexpected."; + continue; + } + + int32 left_tolerance = opts.left_tolerance; + int32 right_tolerance = opts.right_tolerance; + if (!opts.silence_phones_str.empty()) { + if (silence_set.count(phone) > 0) { + left_tolerance = opts.left_tolerance_silence; + right_tolerance = opts.right_tolerance_silence; + } + } + + int32 t_begin = std::max(0, (state_time - left_tolerance)), + t_end = std::min(num_frames, + (next_state_time + right_tolerance)), + t_begin_subsampled = (t_begin + factor - 1)/ factor, + t_end_subsampled = (t_end + factor - 1)/ factor; + for (int32 t_subsampled = t_begin_subsampled; + t_subsampled < t_end_subsampled; t_subsampled++) + proto_supervision->allowed_phones[t_subsampled].push_back(phone); + } + if (lat.Final(state) != CompactLatticeWeight::Zero()) { + if (state_times[state] != num_frames) { + KALDI_WARN << "Time of final state " << state << " in lattice is " + << "not equal to number of frames " << num_frames + << ". Are you sure the lattice is phone-aligned? " + << "Rejecting it."; + return false; + } + } + } + + proto_supervision->fst.AddState(); + proto_supervision->fst.SetStart(0); + for (int32 n = 0; n < sausage_stats.size(); n++) { + int32 s = proto_supervision->fst.AddState(); + KALDI_ASSERT(s == n + 1); + + const std::vector > &sausage_segment = + sausage_stats[n]; + std::vector >::const_iterator it = + sausage_segment.begin(), end = sausage_segment.end(); + + auto max_it = std::max_element(sausage_segment.begin(), end, + [](const std::pair &left, const std::pair &right) { + return left.second < right.second; + }); + BaseFloat max_prob = max_it->second; + + for (; it != end; ++it) { + int32 phone = it->first; + BaseFloat prob = it->second; + if (prob < opts.min_prob) continue; // skip low-probability phone + + proto_supervision->fst.AddArc(n, + fst::StdArc(phone, phone, + fst::TropicalWeight( + opts.arc_scale * (Log(prob) - Log(max_prob)) + + opts.phone_ins_penalty), + n + 1)); + } + } + + proto_supervision->fst.SetFinal(sausage_stats.size(), + fst::TropicalWeight::One()); + + for (int32 t_subsampled = 0; t_subsampled < num_frames_subsampled; + t_subsampled++) { + KALDI_ASSERT(!proto_supervision->allowed_phones[t_subsampled].empty()); + SortAndUniq(&(proto_supervision->allowed_phones[t_subsampled])); + } + + if (GetVerboseLevel() > 1) { + std::cerr << "proto-supervision"; + fst::WriteFstKaldi(std::cerr, false, proto_supervision->fst); + } + + fst::RmEpsilon(&(proto_supervision->fst)); + if (!TryDeterminizeMinimize(kSupervisionMaxStates, &(proto_supervision->fst))) { + KALDI_WARN << "Failed to determinize sausage proto-supervision"; + return false; + } + return true; } bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, const CompactLattice &lat, ProtoSupervision *proto_supervision) { - if (!PhoneLatticeToProtoSupervisionInternal(opts, lat, proto_supervision)) - return false; - if (opts.lm_scale != 0.0) + + if (!opts.use_mbr_decode) { + if (!PhoneLatticeToProtoSupervisionInternalSimple(opts, lat, proto_supervision)) + return false; + if (opts.lm_scale != 0.0) + fst::Push(&(proto_supervision->fst), + fst::REWEIGHT_TO_INITIAL, fst::kDelta, true); + } else { + if (!PhoneLatticeToProtoSupervisionInternalMbr(opts, lat, proto_supervision)) + return false; fst::Push(&(proto_supervision->fst), fst::REWEIGHT_TO_INITIAL, fst::kDelta, true); + } + return true; } diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index 88a5c05efbe..0124111b0f5 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -56,6 
+56,9 @@ struct SupervisionOptions { int32 left_tolerance_silence; int32 right_tolerance_silence; std::string silence_phones_str; + bool use_mbr_decode; + BaseFloat min_prob; + BaseFloat arc_scale; SupervisionOptions(): left_tolerance(5), right_tolerance(5), @@ -64,7 +67,10 @@ struct SupervisionOptions { lm_scale(0.0), phone_ins_penalty(0.0), left_tolerance_silence(0), - right_tolerance_silence(0) { } + right_tolerance_silence(0), + use_mbr_decode(false), + min_prob(0.01), + arc_scale(1.0) { } void Register(OptionsItf *opts) { opts->Register("left-tolerance", &left_tolerance, "Left tolerance for " @@ -89,6 +95,14 @@ struct SupervisionOptions { "shift in silence phone position relative to the alignment"); opts->Register("silence-phones", &silence_phones_str, "A comma separated list of silence phones"); + opts->Register("use-mbr-decode", &use_mbr_decode, + "Use MBR decoding to convert phone lattice to " + "proto-supervision"); + opts->Register("min-prob", &min_prob, + "Minimum probability of sausage arc to keep. " + "Applicable only when --use-mbr-decode is true."); + opts->Register("arc-scale", &arc_scale, + "Arc scale for sausage arcs"); } void Check() const; }; diff --git a/src/chainbin/nnet3-chain-normalize-egs.cc b/src/chainbin/nnet3-chain-normalize-egs.cc index 139c08e7799..7b99c6bd1da 100644 --- a/src/chainbin/nnet3-chain-normalize-egs.cc +++ b/src/chainbin/nnet3-chain-normalize-egs.cc @@ -61,8 +61,8 @@ int main(int argc, char *argv[]) { fst::StdVectorFst normalization_fst; ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); - if (scale <= 0.0) { - KALDI_ERR << "Invalid scale on normalization FST; must be > 0.0"; + if (scale < 0.0) { + KALDI_ERR << "Invalid scale on normalization FST; must be >= 0.0"; } if (scale != 1.0) { From ea6ed69cfe52579017179bbe3635782e914af63c Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 16 Oct 2017 18:35:57 -0400 Subject: [PATCH 074/174] semisup: Adding semisup recipes --- .../run_tdnn_11k_semisupervised_conf_e_old.sh | 450 +++++++++++++++++ .../run_tdnn_11k_semisupervised_conf_m.sh | 433 ++++++++++++++++ .../run_tdnn_11k_semisupervised_conf_n.sh | 436 ++++++++++++++++ .../run_tdnn_11k_semisupervised_conf_q.sh | 436 ++++++++++++++++ .../run_tdnn_11k_semisupervised_conf_r.sh | 436 ++++++++++++++++ .../chain/tuning/run_tdnn_15k_best_path_a.sh | 441 +++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_t.sh | 463 +++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_u.sh | 465 ++++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_v.sh | 437 ++++++++++++++++ .../semisup/chain/tuning/run_tdnn_50k.sh | 194 ++++++++ .../chain/tuning/run_tdnn_50k_best_path_a.sh | 441 +++++++++++++++++ 11 files changed, 4632 insertions(+) create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e_old.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_q.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_r.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_a.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_t.sh create mode 100755 
egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_u.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_v.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_best_path_a.sh diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e_old.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e_old.sh new file mode 100644 index 00000000000..567cee619a0 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e_old.sh @@ -0,0 +1,450 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +lm_opts= + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1e_old # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --weights $lm_weights --cmd "$train_cmd" --lm-opts "$lm_opts" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
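
The context arithmetic earlier in this script (the $[...] additions followed by the perl int() one-liners) pads the model's left and right context by any extra context plus half the frame-subsampling factor, and pads the initial/final contexts by the model context plus the same half factor. In Python terms (the function name and example values are ours, for illustration only):

def egs_contexts(model_left, model_right, extra_left, extra_right,
                 frame_subsampling_factor):
    pad = frame_subsampling_factor // 2
    left = model_left + extra_left + pad            # --left-context
    right = model_right + extra_right + pad         # --right-context
    left_initial = model_left + pad                 # --left-context-initial
    right_final = model_right + pad                 # --right-context-final
    return left, right, left_initial, right_final

print(egs_contexts(16, 16, 0, 0, 3))  # (17, 17, 17, 17)
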
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh new file mode 100644 index 00000000000..6c8b1650082 
--- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh @@ -0,0 +1,433 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1m # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
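
Note how this recipe's xconfig (earlier in this script) differs from the _e_old one: rather than declaring output-0 and output-1 as separate output-layers and then tying their parameters with a perl edit of final.config, it declares them as plain output nodes that read output.affine and output-xent.log-softmax directly, so the supervised and unsupervised egs both update one shared set of output parameters. A loose Python analogy for that sharing (this is not nnet3 code):

import numpy as np

shared_affine = {"W": np.zeros((10, 5)), "b": np.zeros(10)}        # one parameter set
outputs = {"output-0": shared_affine, "output-1": shared_affine}   # two names for it

outputs["output-0"]["W"] += 0.10   # update driven by a supervised minibatch
outputs["output-1"]["W"] += 0.05   # update driven by an unsupervised minibatch
assert np.allclose(shared_affine["W"], 0.15)  # both updates land in the same matrix
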
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
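
The combine_egs.sh change earlier in this series matters for the call above: when a source egs dir has a comma-separated frames_per_eg list, the script now uses the average rather than only the first entry when building frames_per_eg_list. The awk one-liner's effect, written as a hypothetical Python helper:

def average_frames_per_eg(frames_per_eg_str):
    values = [int(v) for v in frames_per_eg_str.split(",")]
    return int(sum(values) / len(values))   # same truncation as awk's int()

assert average_frames_per_eg("150") == 150
assert average_frames_per_eg("150,110,100") == 120
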
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh new file mode 100644 index 00000000000..ca2014d1f76 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh @@ -0,0 +1,436 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1n # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
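+    # The supervised lattices (from the tri3 GMM system) are at the original
+    # frame rate, hence --alignment-subsampling-factor 3 to bring them down
+    # to the chain model's output frame rate; the unsupervised lattices are
+    # decoded with the chain model itself and so use a factor of 1 below.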
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_q.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_q.sh new file mode 100644 index 00000000000..5207acd410c --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_q.sh @@ -0,0 +1,436 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1q # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
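+    # --generate-egs-scp true also writes an egs.scp index for the dumped
+    # egs, which the combine_egs.sh call in stage 14 needs in order to merge
+    # the supervised and unsupervised archives.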
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp "" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + #$chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
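+  # Note: in this variant the unsupervised egs above were dumped with an
+  # empty --deriv-weights-scp (the best-path weights.scp argument is
+  # commented out), i.e. without per-frame derivative weights.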
+fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_r.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_r.sh new file mode 100644 index 00000000000..4b083f356b2 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_r.sh @@ -0,0 +1,436 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1r # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
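+    # frames_per_eg was read from the base supervised model's egs
+    # ($chaindir/egs/info/frames_per_eg), and unsup_frames_per_eg defaults to
+    # the same value, so both egs dirs end up with matching chunk sizes.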
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
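+  # --egs-prefix cegs. makes combine_egs.sh operate on chain egs (cegs.*)
+  # rather than plain nnet3 egs; the combined directory is what train.py
+  # reads via --egs.dir in stage 15.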
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_a.sh new file mode 100755 index 00000000000..a9832070763 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_a.sh @@ -0,0 +1,441 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=best_path_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
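+    # In this best-path variant the lattice supervision is reduced to the
+    # single best path (--keep-only-best-path true), so the lattice-lm-scale
+    # and lattice-prune-beam options used in the other variants are omitted.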
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --keep-only-best-path true \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_t.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_t.sh new 
file mode 100644 index 00000000000..c2f5f9ec93b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_t.sh @@ -0,0 +1,463 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +lm_opts= + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1t # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128,64/300=64,32/600=32,16,8/1200=16,8,4" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" --lm-opts "$lm_opts" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp_sil +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
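  # Sketch (assuming weights.scp holds one per-frame weight vector per utterance,
  # as steps/best_path_weights.sh normally writes): the file passed below as
  # --deriv-weights-scp contains best-path pdf posteriors, and a quick way to
  # eyeball the weights for a single utterance before generating egs is
  #   head -n 1 $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp | \
  #     copy-vector scp:- ark,t:- | head -c 300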
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --no-chunking true \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_u.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_u.sh new file mode 100755 index 
00000000000..fdd7a3da189 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_u.sh @@ -0,0 +1,465 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +lm_opts= + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1u # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128,64/300=64,32/600=32,16,8/1200=16,8,4" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" --lm-opts "$lm_opts" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_spEx +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
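  # Sketch of what --lattice-prune-beam does in the get_egs.sh call below:
  # the decode lattices are pruned to the given beam before being turned into
  # supervision. get_egs.sh prunes internally; the stand-alone command here is
  # only an illustration, and the acoustic scale it uses internally may differ:
  #   lattice-prune --inv-acoustic-scale=10 --beam=$lattice_prune_beam \
  #     "ark:gunzip -c $unsup_lat_dir/lat.1.gz |" ark:/dev/null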
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --no-chunking true \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_v.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_v.sh new file mode 100644 index 
00000000000..c7998761dd7 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_v.sh @@ -0,0 +1,437 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +arc_scale=1.0 # arc-scale for sausage arcs +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1v # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_awt${arc_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
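  # The --left-context/--right-context values passed to get_egs.sh below come
  # from the perl expressions above: each side gets an extra
  # frame_subsampling_factor/2 frames of padding. Worked example with
  # hypothetical numbers: model_left_context=17, extra_left_context=0 and
  # frame_subsampling_factor=3 give left_context = int(17 + 0 + 3/2) = 18.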
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale 0.0 \ + --use-mbr-decode true --arc-scale $arc_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
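  # About the combination above: --lang2weight "$supervision_weights" (here
  # "1.0,1.0") attaches one scale per input egs dir -- supervised first,
  # unsupervised second -- and that scale weights the derivatives computed from
  # the corresponding examples during training (the usual multilingual
  # combine_egs.sh convention; check its --lang2weight usage message if in
  # doubt). Assuming the cegs.*.scp naming implied by --egs-prefix, the number
  # of combined archives can be checked with
  #   ls $comb_egs_dir/cegs.*.scp | wc -l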
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k.sh new file mode 100755 index 00000000000..bf82256545a --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k.sh @@ -0,0 +1,194 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 10 hours. + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup50k +ivector_train_set=semisup50k_250k +tree_affix= +nnet3_affix=_semi50k_250k +chain_affix=_semi50k_250k +exp=exp/semisup_50k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=8 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
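  # A note on the learning-rate options below: in nnet3 train.py the rate
  # actually applied on an iteration is the "effective" rate times the current
  # number of jobs (see steps/libs/nnet3/train/common.py). So with
  # initial-effective-lrate 0.001 and num-jobs-initial 3, training starts around
  # 0.003, and with final-effective-lrate 0.0001 and num-jobs-final 16 it ends
  # around 0.0016. This is a sketch of the usual convention, not a statement of
  # the exact schedule.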
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_best_path_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_best_path_a.sh new file mode 100755 index 00000000000..d4fa3f5d20b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_best_path_a.sh @@ -0,0 +1,441 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=best_path_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
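  # Sketch of what --keep-only-best-path true means in the get_egs.sh call
  # below: as the name suggests, each unsupervised lattice is reduced to its
  # single best path before the supervision FST is built. A rough stand-alone
  # equivalent on one lattice archive (illustrative only; the scales used
  # internally may differ) would be
  #   lattice-1best --acoustic-scale=0.1 \
  #     "ark:gunzip -c $unsup_lat_dir/lat.1.gz |" ark,t:- | head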
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --keep-only-best-path true \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + From bacca8b044469ca59c9b968b7bb0e81d770068f0 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 17 Oct 2017 03:06:31 -0400 Subject: [PATCH 075/174] Minor bug fix in get_egs.sh --- 
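Note: the lattice-copy stage previously used plain "ln -s", which fails if the
links already exist from an earlier run; "ln -sf" overwrites them instead, and
lat.scp itself is now linked as well. A minimal shell illustration (the source
path is hypothetical):

    ln -s  /some/src/lat.1.ark $dir/   # re-run: fails because the link already exists
    ln -sf /some/src/lat.1.ark $dir/   # re-run: quietly replaces the existing link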
egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 196c1f9d22b..35a8e72eaec 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -310,7 +310,8 @@ if [ $stage -le 2 ]; then for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp else - ln -s `readlink -f $lat_copy_src`/lat.*.{ark,scp} $dir/ + ln -sf `readlink -f $lat_copy_src`/lat.*.{ark,scp} $dir/ + ln -sf `readlink -f $lat_copy_src`/lat.scp $dir/ fi fi From 417ecfd8ddb3faa7b7c1e6d38c4d6b29e190fbe8 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 17 Oct 2017 14:13:05 -0400 Subject: [PATCH 076/174] Best path system recipe --- .../chain/tuning/run_tdnn_15k_best_path_b.sh | 451 ++++++++++++++++++ egs/wsj/s5/steps/best_path_weights.sh | 9 +- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 14 +- 3 files changed, 462 insertions(+), 12 deletions(-) create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_b.sh diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_b.sh new file mode 100755 index 00000000000..b01cd361c22 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_b.sh @@ -0,0 +1,451 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=best_path_comb1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! cuda-compiled; then + cat < $out_dir/lat.JOB.gz" + + echo $this_nj/$out_dir/num_jobs +fi + +decode_affix=${decode_affix}_fg + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/subset_ali_dir.sh --cmd "$train_cmd" \ + data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ + $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} + echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
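+  # As a concrete example: with xent_regularize=0.1 (set near the top of this
+  # script), learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+  # evaluates to 5.0, so output-xent below trains with a 5x learning-rate
+  # factor while the main chain output keeps a factor of 1.0.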
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/best_path_lats_${unsupervised_set}${decode_affix} + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
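+    # Note: in this best-path recipe the supervision does not come from the
+    # full decoding lattices; $unsup_lat_dir points to the 1-best lattices
+    # produced by lattice-interp/lattice-1best in stage 9, and the per-frame
+    # derivative weights (lattice posteriors of the best-path pdfs, from
+    # steps/best_path_weights.sh) are applied through --deriv-weights-scp below.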
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/wsj/s5/steps/best_path_weights.sh b/egs/wsj/s5/steps/best_path_weights.sh index c2e0c60f961..b70acae0362 100755 --- a/egs/wsj/s5/steps/best_path_weights.sh +++ 
b/egs/wsj/s5/steps/best_path_weights.sh @@ -38,6 +38,7 @@ set -e cmd=run.pl stage=-10 acwt=0.1 +write_words=false #end configuration section. help_message="Usage: "$(basename $0)" [options] [:weight] [:weight] [[:weight] ... ] @@ -70,12 +71,18 @@ nj=`cat $decode_dir/num_jobs` mkdir -p $dir +words_wspecifier=ark:/dev/null + +if $write_words; then + words_wspecifier="ark,t:| utils/int2sym.pl -f 2- $lang/words.txt > words.JOB.txt" +fi + if [ $stage -lt -1 ]; then mkdir -p $dir/log $cmd JOB=1:$nj $dir/log/best_path.JOB.log \ lattice-best-path --acoustic-scale=$acwt \ "ark,s,cs:gunzip -c $decode_dir/lat.JOB.gz |" \ - ark:/dev/null "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 + "$words_wspecifier" "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 fi src_dir=`dirname $decode_dir` diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 35a8e72eaec..ba89c94f29f 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -80,7 +80,6 @@ generate_egs_scp=false no_chunking=false use_mbr_decode=false arc_scale=1.0 -keep_only_best_path=false lat_copy_src= echo "$0 $@" # Print the command line for logging @@ -298,16 +297,9 @@ if [ $stage -le 2 ]; then echo "$0: copying training lattices" if [ -z "$lat_copy_src" ]; then - if ! $keep_only_best_path; then - $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ - lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; - else - $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ - lattice-interp --alpha=1 "ark:gunzip -c $latdir/lat.JOB.gz|" \ - "ark:gunzip -c $latdir/lat.JOB.gz | lattice-1best --acoustic-scale=$acwt ark:- ark:- |" \ - ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; - fi - + $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ + lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; + for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp else ln -sf `readlink -f $lat_copy_src`/lat.*.{ark,scp} $dir/ From 6f0de80775d0e4bc7b6a7c5c0a89212a9d984678 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 17 Oct 2017 22:28:03 -0400 Subject: [PATCH 077/174] Add some minor check --- .../chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh index 6c8b1650082..0c306dfa05c 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh @@ -152,11 +152,9 @@ cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 sup_ali_dir=$exp/tri3 treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} -if [ $stage -le 9 ]; then - if [ -f $treedir/final.mdl ]; then - echo "$0: $treedir/final.mdl already exists. Remove it and try again." - exit 1 - fi +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 fi dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} From c6aa0e461cf9c30ee0a8abb3dcf81b8250f97478 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 19 Oct 2017 15:13:36 -0400 Subject: [PATCH 078/174] Updates to work with RNNLM --- .../run_tdnn_11k_semisupervised_conf_m.sh | 7 +- .../run_tdnn_11k_semisupervised_conf_n.sh | 10 +- .../run_tdnn_15k_semisupervised_conf_t.sh | 125 +++++++----------- .../run_tdnn_15k_semisupervised_conf_u.sh | 5 +- egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh | 13 +- egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh | 30 +++-- src/chain/chain-supervision-splitter-test.cc | 33 +++-- src/latbin/lattice-expand-ngram.cc | 64 +++++++-- src/tfrnnlmbin/lattice-lmrescore-tf-rnnlm.cc | 71 ++++++++-- 9 files changed, 229 insertions(+), 129 deletions(-) diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh index 0c306dfa05c..2c99f6e4a72 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# This script is same as _e, but uses tree trained only on supervised data. # unsup_frames_per_eg=150 # Deriv weights: Lattice posterior of best path pdf # Unsupervised weight: 1.0 @@ -118,14 +118,15 @@ for dset in $unsupervised_set; do if [ $stage -le 4 ]; then echo "$0: getting the decoding lattices for the unsupervised subset using the chain model at: $chaindir" steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ + --acwt 1.0 --post-decode-acwt 10.0 --write-compact false \ --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ --scoring-opts "--min-lmwt 10 --max-lmwt 10" \ $graphdir data/${dset}_sp_hires $chaindir/decode_${dset}_sp${decode_affix} fi if [ $stage -le 5 ]; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_test${graph_affix} \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" --write-compact false \ + data/lang_test${graph_affix} \ data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ $chaindir/decode_${dset}_sp${decode_affix} \ $chaindir/decode_${dset}_sp${decode_affix}_fg diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh index ca2014d1f76..42fa4e68283 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# This script is same as _m, but uses split lattices for supervision. # unsup_frames_per_eg=150 # Deriv weights: Lattice posterior of best path pdf # Unsupervised weight: 1.0 @@ -155,11 +155,9 @@ cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 sup_ali_dir=$exp/tri3 treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} -if [ $stage -le 9 ]; then - if [ -f $treedir/final.mdl ]; then - echo "$0: $treedir/final.mdl already exists. Remove it and try again." - exit 1 - fi +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 fi dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_t.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_t.sh index c2f5f9ec93b..b90008afaf5 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_t.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_t.sh @@ -1,11 +1,11 @@ #!/bin/bash -# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# This script is same as _n, but uses RNN-LM for decoding. # unsup_frames_per_eg=150 # Deriv weights: Lattice posterior of best path pdf # Unsupervised weight: 1.0 # Weights for phone LM (supervised, unsupervises): 5,2 -# LM for decoding unsupervised data: 4gram +# LM for decoding unsupervised data: tf-fast-lstm set -u -e -o pipefail @@ -23,32 +23,38 @@ semi_affix=semi15k_250k # affix relating train-set splitting proportion tdnn_affix=7b # affix for the supervised chain-model directory train_supervised_opts="--stage -10 --train-stage -10" -lm_opts= - # Unsupervised options decode_affix= egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly -lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices -lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data -tolerance=2 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph phone_insertion_penalty= # Semi-supervised options -comb_affix=comb1t # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +comb_affix=comb1n # affix for new chain-model directory trained on the combined supervised+unsupervised subsets supervision_weights=1.0,1.0 lm_weights=5,2 sup_egs_dir= unsup_egs_dir= tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +# RNN-LM opts +rnnlm_weight=0.5 +rnnlm_dir=data/tf_fast_lstm_ex250k +rnnlm_affix=unk.fast.tfrnnlm +rnnlm_beam= extra_left_context=0 extra_right_context=0 xent_regularize=0.1 hidden_dim=725 -minibatch_size="150=128,64/300=64,32/600=32,16,8/1200=16,8,4" +minibatch_size=128 # to tune: # frames_per_eg for unsupervised @@ -72,6 +78,7 @@ echo "$0 $@" # Print the command line for logging nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs decode_affix=${decode_affix}${graph_affix} egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= RANDOM=0 @@ -85,7 +92,7 @@ fi if false && [ $stage -le 1 ]; then echo "$0: chain training on the supervised subset data/${supervised_set}" - local/chain/run_tdnn_15k.sh $train_supervised_opts --remove-egs false \ + local/chain/run_tdnn_11k.sh $train_supervised_opts --remove-egs false \ --train-set $supervised_set --ivector-train-set $base_train_set \ --nnet3-affix $nnet3_affix --tdnn-affix $tdnn_affix --exp $exp fi @@ -109,64 +116,40 @@ fi unsupervised_set=${unsupervised_set}_240k for dset in $unsupervised_set; do - if [ $stage -le 3 ]; then - if [ -f data/${dset}_sp_sil_hires/feats.scp ]; 
then - echo "$0: data/${dset}_sp_sil_hires/feats.scp exists. Remove it re-run or skip this stage." - exit 1 - fi - - utils/data/perturb_data_dir_speed_3way.sh data/$dset data/${dset}_sp_tmp_hires - utils/data/segment_data.sh --nj 30 --cmd "$train_cmd" \ - data/${dset}_sp_tmp_hires data/${dset}_sp_seg_hires - - python utils/data/perturb_speed_to_allowed_lengths.py --no-speed-perturb \ - 12 data/${dset}_sp_seg_hires data/${dset}_sp_sil_hires - - utils/fix_data_dir.sh data/${dset}_sp_sil_hires - - steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf \ - --write-utt2num-frames true --nj 40 --cmd "$train_cmd" \ - data/${dset}_sp_sil_hires - - steps/compute_cmvn_stats.sh \ - data/${dset}_sp_sil_hires + if [ $stage -le 3 ] && [ ! -f data/${dset}_sp_hires/feats.scp ]; then + utils/data/perturb_data_dir_speed_3way.sh data/$dset data/${dset}_sp_hires_tmp + utils/subset_data_dir.sh --utt-list data/${dset}_sp_hires_tmp/utt2spk \ + data/${base_train_set}_sp_hires data/${dset}_sp_hires fi if [ $stage -le 4 ]; then - steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 \ - data/${dset}_sp_sil_hires data/${dset}_sp_sil_max2_hires || exit 1; - - steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ - data/${dset}_sp_sil_max2_hires $exp/nnet3${nnet3_affix}/extractor \ - $exp/nnet3${nnet3_affix}/ivectors_${dset}_sp_sil_hires || exit 1; - fi - - if [ $stage -le 5 ] && [ ! -f $chaindir/decode_${dset}_sp_sil${decode_affix}/lat.1.gz ]; then echo "$0: getting the decoding lattices for the unsupervised subset using the chain model at: $chaindir" steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${dset}_sp_sil_hires \ + --acwt 1.0 --post-decode-acwt 10.0 --write-compact false \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ --scoring-opts "--min-lmwt 10 --max-lmwt 10" \ - $graphdir data/${dset}_sp_sil_hires $chaindir/decode_${dset}_sp_sil${decode_affix} + $graphdir data/${dset}_sp_hires $chaindir/decode_${dset}_sp${decode_affix} fi - if [ $stage -le 6 ]; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_test${graph_affix} \ - data/lang_test${graph_affix}_fg data/${dset}_sp_sil_hires \ - $chaindir/decode_${dset}_sp_sil${decode_affix} \ - $chaindir/decode_${dset}_sp_sil${decode_affix}_fg + if [ $stage -le 5 ]; then + steps/lmrescore_rnnlm_lat.sh --cmd "$tfrnnlm_cmd --mem 16G" --expand-ngram true --beam "$rnnlm_beam" --inv-acwt 10 \ + --rnnlm-ver tensorflow --weight ${rnnlm_weight} --max-ngram-order 3 \ + data/lang_test${graph_affix} $rnnlm_dir \ + data/${dset}_sp_hires \ + $chaindir/decode_${dset}_sp${decode_affix} \ + $chaindir/decode_${dset}_sp${decode_affix}.${rnnlm_affix}.lat.3gram.${rnnlm_weight} - ln -s ../final.mdl $chaindir/decode_${dset}_sp_sil${decode_affix}_fg/final.mdl || true + ln -s ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}.unk.fast.tfrnnlm.lat.3gram.${rnnlm_weight} || true fi done -decode_affix=${decode_affix}_fg +decode_affix=${decode_affix}.${rnnlm_affix}.lat.3gram.${rnnlm_weight} if [ $stage -le 8 ]; then steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ - data/${unsupervised_set}_sp_sil_hires data/lang_chain \ - $chaindir/decode_${unsupervised_set}_sp_sil${decode_affix} \ - $chaindir/best_path_${unsupervised_set}_sp_sil${decode_affix} + data/${unsupervised_set}_sp_hires data/lang_chain \ + $chaindir/decode_${unsupervised_set}_sp${decode_affix} \ + 
$chaindir/best_path_${unsupervised_set}_sp${decode_affix} fi frame_subsampling_factor=1 @@ -178,23 +161,15 @@ cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 sup_ali_dir=$exp/tri3 treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} -if [ $stage -le 9 ]; then - if [ ! -f $treedir/final.mdl ]; then - echo "$0: $treedir/final.mdl does not exist." - exit 1 - fi +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 fi dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} if [ $stage -le 10 ]; then - steps/subset_ali_dir.sh --cmd "$train_cmd" \ - data/${unsupervised_set} data/${unsupervised_set}_sp_sil_hires \ - $chaindir/best_path_${unsupervised_set}_sp_sil${decode_affix} \ - $chaindir/best_path_${unsupervised_set}${decode_affix} - echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor - - steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" --lm-opts "$lm_opts" \ + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ $dir fi @@ -292,7 +267,7 @@ else frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) fi -unsupervised_set=${unsupervised_set}_sp_sil +unsupervised_set=${unsupervised_set}_sp unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg @@ -307,19 +282,18 @@ if [ -z "$unsup_egs_dir" ]; then touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. echo "$0: generating egs from the unsupervised data" - steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ --left-tolerance $tolerance --right-tolerance $tolerance \ --left-context $left_context --right-context $right_context \ --left-context-initial $left_context_initial --right-context-final $right_context_final \ --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ --frame-subsampling-factor $frame_subsampling_factor \ - --no-chunking true \ --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ --lattice-prune-beam "$lattice_prune_beam" \ --phone-insertion-penalty "$phone_insertion_penalty" \ --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ - --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ - --generate-egs-scp true \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ data/${unsupervised_set}_hires $dir \ $unsup_lat_dir $unsup_egs_dir fi @@ -335,7 +309,6 @@ if [ $stage -le 14 ]; then touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
fi - if [ $train_stage -le -4 ]; then train_stage=-4 fi @@ -349,11 +322,11 @@ if [ $stage -le 15 ]; then --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient 0.1 \ --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights true \ + --chain.apply-deriv-weights $apply_deriv_weights \ --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 4 \ --trainer.optimization.num-jobs-initial 3 \ @@ -399,6 +372,11 @@ if [ $stage -le 18 ]; then done fi +if ! $do_finetuning; then + wait + exit 0 +fi + if [ $stage -le 19 ]; then mkdir -p ${dir}${finetune_suffix} @@ -460,4 +438,3 @@ fi wait; exit 0; - diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_u.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_u.sh index fdd7a3da189..920b4317669 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_u.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_u.sh @@ -1,7 +1,8 @@ #!/bin/bash -# This script is same as _d, but uses a weight of 1.0 for unsupervised set. -# unsup_frames_per_eg=150 +# This script is same as _t, but uses speed perturbation instead of +# silence padding for creating discrete length utterances. +# unsup_frames_per_eg= # Deriv weights: Lattice posterior of best path pdf # Unsupervised weight: 1.0 # Weights for phone LM (supervised, unsupervises): 5,2 diff --git a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh index 5da18185f39..826dfa8a53a 100755 --- a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh +++ b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh @@ -17,6 +17,7 @@ weight=1.0 # Interpolation weight for RNNLM. expand_ngram=false beam= +write_compact=true # End configuration section. rnnlm_ver= #layer_string= @@ -95,27 +96,27 @@ cp $indir/num_jobs $outdir lat="ark:gunzip -c $indir/lat.JOB.gz |" if $expand_ngram; then - lat="$lat lattice-expand-ngram --n=$max_ngram_order ark:- ark:- |" + lat="$lat lattice-expand-ngram --write-compact=$write_compact --n=$max_ngram_order ark:- ark:- |" fi if [ ! 
-z "$beam" ]; then - lat="$lat lattice-prune --inv-acoustic-scale=$inv_acwt --beam=$beam ark:- ark:- |" + lat="$lat lattice-prune --write-compact=$write_compact --inv-acoustic-scale=$inv_acwt --beam=$beam ark:- ark:- |" fi oldlm_weight=`perl -e "print -1.0 * $weight;"` if [ "$oldlm" == "$oldlang/G.fst" ]; then $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ - lattice-lmrescore --lm-scale=$oldlm_weight \ + lattice-lmrescore --lm-scale=$oldlm_weight --write-compact=$write_compact \ "$lat" "$oldlm_command" ark:- \| \ - $rescoring_binary $extra_arg --lm-scale=$weight \ + $rescoring_binary $extra_arg --lm-scale=$weight --write-compact=$write_compact \ --max-ngram-order=$max_ngram_order \ $first_arg $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \ "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; else $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ - lattice-lmrescore-const-arpa --lm-scale=$oldlm_weight \ + lattice-lmrescore-const-arpa --lm-scale=$oldlm_weight --write-compact=$write_compact \ "$lat" "$oldlm_command" ark:- \| \ - $rescoring_binary $extra_arg --lm-scale=$weight \ + $rescoring_binary $extra_arg --lm-scale=$weight --write-compact=$write_compact \ --max-ngram-order=$max_ngram_order \ $first_arg $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \ "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh index 89211621fa9..e507c7467c0 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh @@ -77,6 +77,7 @@ acwt=0.1 # For pruning phone_insertion_penalty= deriv_weights_scp= generate_egs_scp=false +lat_copy_src= echo "$0 $@" # Print the command line for logging @@ -276,10 +277,15 @@ fi if [ $stage -le 2 ]; then echo "$0: copying training lattices" - $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ - lattice-copy --write-compact=false "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; - - for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp + if [ -z "$lat_copy_src" ]; then + $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; + + for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp + else + ln -sf `readlink -f $lat_copy_src`/lat.*.{ark,scp} $dir/ + ln -sf `readlink -f $lat_copy_src`/lat.scp $dir/ + fi fi egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" @@ -296,8 +302,19 @@ chain_supervision_all_opts="--supervision.frame-subsampling-factor=$alignment_su chain_supervision_all_opts="$chain_supervision_all_opts --supervision.left-tolerance=$left_tolerance" normalization_scale=1.0 + +lattice_copy_cmd="ark:-" +if [ ! -z $lattice_prune_beam ]; then + if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then + lattice_copy_cmd="ark:- | lattice-1best --acoustic-scale=$acwt ark:- ark:-" + else + lattice_copy_cmd="ark:- | lattice-prune --write-compact=false --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:-" + fi +fi + if [ ! 
-z "$lattice_lm_scale" ]; then chain_supervision_all_opts="$chain_supervision_all_opts --supervision.lm-scale=$lattice_lm_scale" + normalization_scale=$(perl -e " if ($lattice_lm_scale > 1.0 || $lattice_lm_scale < 0) { print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; @@ -325,11 +342,6 @@ echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context echo $left_context_initial > $dir/info/left_context_initial echo $right_context_final > $dir/info/right_context_final - -lattice_copy_cmd="ark:-" - -[ ! -z $lattice_prune_beam ] && \ - lattice_copy_cmd="ark:- | lattice-prune --write-compact=false --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:-" if [ $stage -le 3 ]; then echo "$0: Getting validation and training subset examples." diff --git a/src/chain/chain-supervision-splitter-test.cc b/src/chain/chain-supervision-splitter-test.cc index 9314da13682..ccc6cdabbfd 100644 --- a/src/chain/chain-supervision-splitter-test.cc +++ b/src/chain/chain-supervision-splitter-test.cc @@ -249,15 +249,9 @@ void ChainSupervisionSplitterTest(int32 index) { delete trans_model; } -void TestToleranceFst() { +void TestToleranceFst(chain::SupervisionOptions &sup_opts, int32 num_phones) { ContextDependency *ctx_dep; - TransitionModel *trans_model = GetSimpleChainTransitionModel(&ctx_dep, 2); - - chain::SupervisionOptions sup_opts; - sup_opts.left_tolerance = 1; - sup_opts.right_tolerance = 1; - sup_opts.frame_subsampling_factor = 1; - sup_opts.lm_scale = 0.5; + TransitionModel *trans_model = GetSimpleChainTransitionModel(&ctx_dep, num_phones); fst::StdVectorFst tolerance_fst; GetToleranceEnforcerFst(sup_opts, *trans_model, &tolerance_fst); @@ -272,11 +266,30 @@ void TestToleranceFst() { } // namespace chain } // namespace kaldi -int main() { +int main(int argc, char *argv[]) { using namespace kaldi; SetVerboseLevel(2); - kaldi::chain::TestToleranceFst(); + const char *usage = "chain-supervision-test [options]"; + + ParseOptions po(usage); + + int32 num_phones = 1; + + po.Register("num-phones", &num_phones, + "Number of phones"); + + chain::SupervisionOptions sup_opts; + sup_opts.left_tolerance = 1; + sup_opts.right_tolerance = 1; + sup_opts.frame_subsampling_factor = 1; + sup_opts.lm_scale = 0.5; + + sup_opts.Register(&po); + + po.Read(argc, argv); + + kaldi::chain::TestToleranceFst(sup_opts, num_phones); return 0; for (int32 i = 0; i < 10; i++) { diff --git a/src/latbin/lattice-expand-ngram.cc b/src/latbin/lattice-expand-ngram.cc index 1b8cfbee24b..6c49fab9daa 100644 --- a/src/latbin/lattice-expand-ngram.cc +++ b/src/latbin/lattice-expand-ngram.cc @@ -20,6 +20,7 @@ #include "util/common-utils.h" #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" int main(int argc, char *argv[]) { try { @@ -38,9 +39,11 @@ int main(int argc, char *argv[]) { "e.g.: lattice-expand-ngram --n=3 ark:lat ark:expanded_lat\n"; ParseOptions po(usage); + bool write_compact = true; int32 n = 3; std::string word_syms_filename; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("n", &n, "n-gram context to expand to."); po.Read(argc, argv); @@ -56,33 +59,74 @@ int main(int argc, char *argv[]) { lats_wspecifier = po.GetOptArg(2); fst::UnweightedNgramFst expand_fst(n); + + SequentialCompactLatticeReader compact_lattice_reader; + SequentialLatticeReader lattice_reader; - SequentialCompactLatticeReader lat_reader(lats_rspecifier); - CompactLatticeWriter lat_writer(lats_wspecifier); + 
CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + + if (write_compact) { + compact_lattice_reader.Open(lats_rspecifier); + compact_lattice_writer.Open(lats_wspecifier); + } else { + lattice_reader.Open(lats_rspecifier); + lattice_writer.Open(lats_wspecifier); + } int32 n_done = 0, n_fail = 0; - for (; !lat_reader.Done(); lat_reader.Next()) { - std::string key = lat_reader.Key(); + for (; write_compact ? !compact_lattice_reader.Done() : !lattice_reader.Done(); + write_compact ? compact_lattice_reader.Next() : lattice_reader.Next()) { + std::string key; + CompactLattice clat; + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + KALDI_LOG << "Processing lattice for key " << key; - CompactLattice lat = lat_reader.Value(); + if (write_compact) { + key = compact_lattice_reader.Key(); + clat = compact_lattice_reader.Value(); + compact_lattice_reader.FreeCurrent(); + } else { + key = lattice_reader.Key(); + const Lattice &lat = lattice_reader.Value(); + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + ComputeAcousticScoresMap(lat, &acoustic_scores); + + ConvertLattice(lat, &clat); + + lattice_reader.FreeCurrent(); + } CompactLattice expanded_lat; - ComposeDeterministicOnDemand(lat, &expand_fst, &expanded_lat); + ComposeDeterministicOnDemand(clat, &expand_fst, &expanded_lat); if (expanded_lat.Start() == fst::kNoStateId) { KALDI_WARN << "Empty lattice for utterance " << key << std::endl; n_fail++; } else { - if (lat.NumStates() == expanded_lat.NumStates()) { + if (clat.NumStates() == expanded_lat.NumStates()) { KALDI_LOG << "Lattice for key " << key << " did not need to be expanded for order " << n << "."; } else { - KALDI_LOG << "Lattice expanded from " << lat.NumStates() << " to " + KALDI_LOG << "Lattice expanded from " << clat.NumStates() << " to " << expanded_lat.NumStates() << " states for order " << n << "."; } - lat_writer.Write(key, expanded_lat); + if (write_compact) { + compact_lattice_writer.Write(key, expanded_lat); + } else { + Lattice out_lat; + fst::ConvertLattice(expanded_lat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); + } n_done++; } - lat_reader.FreeCurrent(); } KALDI_LOG << "Processed " << n_done << " lattices with " << n_fail << " failures."; diff --git a/src/tfrnnlmbin/lattice-lmrescore-tf-rnnlm.cc b/src/tfrnnlmbin/lattice-lmrescore-tf-rnnlm.cc index 6b0ba07926d..666c8cf0fde 100644 --- a/src/tfrnnlmbin/lattice-lmrescore-tf-rnnlm.cc +++ b/src/tfrnnlmbin/lattice-lmrescore-tf-rnnlm.cc @@ -44,9 +44,11 @@ int main(int argc, char *argv[]) { " words.txt ark:in.lats rnnlm ark:out.lats\n"; ParseOptions po(usage); + bool write_compact = true; int32 max_ngram_order = 3; BaseFloat lm_scale = 1.0; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("lm-scale", &lm_scale, "Scaling factor for language model " "costs"); po.Register("max-ngram-order", &max_ngram_order, @@ -84,15 +86,45 @@ int main(int argc, char *argv[]) { KaldiTfRnnlmWrapper rnnlm(opts, rnn_word_list, word_symbols_rxfilename, unk_prob_file, rnnlm_rxfilename); - // Reads and writes as compact lattice. 
- SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); - CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + SequentialCompactLatticeReader compact_lattice_reader; + SequentialLatticeReader lattice_reader; + + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + + if (write_compact) { + compact_lattice_reader.Open(lats_rspecifier); + compact_lattice_writer.Open(lats_wspecifier); + } else { + lattice_reader.Open(lats_rspecifier); + lattice_writer.Open(lats_wspecifier); + } int32 n_done = 0, n_fail = 0; - for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { - std::string key = compact_lattice_reader.Key(); - CompactLattice clat = compact_lattice_reader.Value(); - compact_lattice_reader.FreeCurrent(); + for (; write_compact ? !compact_lattice_reader.Done() : !lattice_reader.Done(); + write_compact ? compact_lattice_reader.Next() : lattice_reader.Next()) { + std::string key; + CompactLattice clat; + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + + if (write_compact) { + key = compact_lattice_reader.Key(); + clat = compact_lattice_reader.Value(); + compact_lattice_reader.FreeCurrent(); + } else { + key = lattice_reader.Key(); + const Lattice &lat = lattice_reader.Value(); + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + ComputeAcousticScoresMap(lat, &acoustic_scores); + + ConvertLattice(lat, &clat); + + lattice_reader.FreeCurrent(); + } if (lm_scale != 0.0) { // Before composing with the LM FST, we scale the lattice weights @@ -123,13 +155,34 @@ int main(int argc, char *argv[]) { << " (incompatible LM?)"; n_fail++; } else { - compact_lattice_writer.Write(key, determinized_clat); + if (write_compact) { + compact_lattice_writer.Write(key, determinized_clat); + } else { + Lattice out_lat; + fst::ConvertLattice(determinized_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); + } n_done++; } } else { // Zero scale so nothing to do. n_done++; - compact_lattice_writer.Write(key, clat); + + if (write_compact) { + compact_lattice_writer.Write(key, clat); + } else { + Lattice out_lat; + fst::ConvertLattice(clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); + } } } From c22bd485520a0c338e29447bb66e9be5702a78a5 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 20 Oct 2017 13:32:00 -0400 Subject: [PATCH 079/174] Fix tolerance fst --- .../chain/tuning/run_tdnn_15k_best_path_b.sh | 10 ++- .../run_tdnn_15k_semisupervised_conf_t.sh | 2 +- src/chain/chain-supervision-splitter.cc | 84 ++++++++++++------- 3 files changed, 63 insertions(+), 33 deletions(-) diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_b.sh index b01cd361c22..88988be9d1c 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_b.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_b.sh @@ -157,17 +157,21 @@ if [ ! 
-f $treedir/final.mdl ]; then exit 1 fi -if [ $stage -le 9 ]; then - this_nj=$(cat $chaindir/decode_${unsupervised_set}_sp${decode_affix}_fg/num_jobs) +this_nj=$(cat $chaindir/decode_${unsupervised_set}_sp${decode_affix}_fg/num_jobs) +if [ $stage -le 9 ]; then out_dir=$chaindir/best_path_lats_${unsupervised_set}_sp${decode_affix}_fg $train_cmd JOB=1:$this_nj $out_dir/log/get_best_path_lats.JOB.log \ lattice-interp "ark:gunzip -c $chaindir/decode_${unsupervised_set}_sp${decode_affix}/lat.JOB.gz |" \ "ark:gunzip -c $chaindir/decode_${unsupervised_set}_sp${decode_affix}_fg/lat.JOB.gz | lattice-1best ark:- ark:- |" \ "ark:| gzip -c > $out_dir/lat.JOB.gz" - echo $this_nj/$out_dir/num_jobs + echo $this_nj > $out_dir/num_jobs fi + +ln -sf ../final.mdl $chaindir/best_path_lats_${unsupervised_set}_sp${decode_affix}_fg/final.mdl + +echo $this_nj > $chaindir/best_path_lats_${unsupervised_set}_sp${decode_affix}_fg/num_jobs decode_affix=${decode_affix}_fg diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_t.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_t.sh index b90008afaf5..591df3c8aab 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_t.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_t.sh @@ -34,7 +34,7 @@ graph_affix=_ex250k # can be used to decode the unsup data with another lm/gra phone_insertion_penalty= # Semi-supervised options -comb_affix=comb1n # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +comb_affix=comb1t # affix for new chain-model directory trained on the combined supervised+unsupervised subsets supervision_weights=1.0,1.0 lm_weights=5,2 sup_egs_dir= diff --git a/src/chain/chain-supervision-splitter.cc b/src/chain/chain-supervision-splitter.cc index edff10a4648..5fff6269835 100644 --- a/src/chain/chain-supervision-splitter.cc +++ b/src/chain/chain-supervision-splitter.cc @@ -561,26 +561,42 @@ void ToleranceEnforcerFstCreator::AddArcsForForwardTransition( int32 self_loop_tid = trans_model_.SelfLoopOf(tstate); int32 self_loop_pdf = trans_model_.TransitionIdToPdf(self_loop_tid); + // self-loop accepting forward-tid + fst_->AddArc(init_state, + fst::StdArc(trans_id, forward_pdf + 1, + fst::TropicalWeight::One(), + init_state)); + for (int32 i = 1; i <= 3; i++) { StateId next_state = GetStateId(offset, forward_id, i); - // accept a forward transition from initial state - fst_->AddArc(init_state, - fst::StdArc(trans_id, forward_pdf + 1, - fst::TropicalWeight::One(), - next_state)); - - // epsilon-arc to initial state - fst_->AddArc(next_state, - fst::StdArc(0, 0, - fst::TropicalWeight::One(), - init_state)); + if (i == kDeletion || i == kInsertion) { + // epsilon-arc to initial state + fst_->AddArc(next_state, + fst::StdArc(0, 0, + fst::TropicalWeight::One(), + init_state)); + } - // self-loop - fst_->AddArc(next_state, - fst::StdArc(self_loop_tid, self_loop_pdf + 1, - fst::TropicalWeight::One(), - next_state)); + if (i == kAccept) { + // accept a forward transition from initial state + fst_->AddArc(init_state, + fst::StdArc(trans_id, forward_pdf + 1, + fst::TropicalWeight::One(), + next_state)); + + // self-loop accepting self-loop tid + fst_->AddArc(next_state, + fst::StdArc(self_loop_tid, self_loop_pdf + 1, + fst::TropicalWeight::One(), + next_state)); + + // self-loop transition back to initial state + fst_->AddArc(next_state, + fst::StdArc(self_loop_tid, self_loop_pdf + 
1, + fst::TropicalWeight::One(), + init_state)); + } } } @@ -593,20 +609,31 @@ void ToleranceEnforcerFstCreator::AddArcsBetweenOffsets( int32 self_loop_pdf = trans_model_.TransitionIdToPdf(self_loop_tid); if (offset > -opts_.left_tolerance) { - StateId state = GetStateId(offset, forward_id, kDeletion); + StateId accept_state = GetStateId(offset, forward_id, kAccept); + StateId delete_state = GetStateId(offset, forward_id, kDeletion); StateId next_state = GetStateId(offset - 1, forward_id, kDeletion); - fst_->AddArc(state, + fst_->AddArc(accept_state, + fst::StdArc(self_loop_tid, 0, + fst::TropicalWeight::One(), + next_state)); + fst_->AddArc(delete_state, fst::StdArc(self_loop_tid, 0, fst::TropicalWeight::One(), next_state)); } if (offset < opts_.right_tolerance) { - StateId state = GetStateId(offset, forward_id, kInsertion); + StateId accept_state = GetStateId(offset, forward_id, kAccept); + StateId insert_state = GetStateId(offset, forward_id, kInsertion); StateId next_state = GetStateId(offset + 1, forward_id, kInsertion); - fst_->AddArc(state, + fst_->AddArc(accept_state, + fst::StdArc(0, self_loop_pdf + 1, + fst::TropicalWeight::One(), + next_state)); + + fst_->AddArc(insert_state, fst::StdArc(0, self_loop_pdf + 1, fst::TropicalWeight::One(), next_state)); @@ -614,20 +641,18 @@ void ToleranceEnforcerFstCreator::AddArcsBetweenOffsets( if (offset == 0) { if (forward_id == 0) { - StateId state = GetStateId(offset, forward_id, kInit); + StateId init_state = GetStateId(offset, forward_id, kInit); fst_->AddArc(0, fst::StdArc(0, 0, fst::TropicalWeight::One(), - state)); + init_state)); } - for (int32 i = 1; i <= 3; i++) { - StateId next_state = GetStateId(offset, forward_id, i); - fst_->AddArc(0, - fst::StdArc(self_loop_tid, self_loop_pdf + 1, - fst::TropicalWeight::One(), - next_state)); - } + StateId next_state = GetStateId(offset, forward_id, kAccept); + fst_->AddArc(0, + fst::StdArc(0, 0, + fst::TropicalWeight::One(), + next_state)); } } @@ -657,6 +682,7 @@ void ToleranceEnforcerFstCreator::MakeFst() { AddArcsForOffset(o); } + fst::Connect(fst_); fst::ArcSort(fst_, fst::ILabelCompare()); } From 0d8af581c1d1dee8153414577a7f28ce27303394 Mon Sep 17 00:00:00 2001 From: System User Date: Fri, 20 Oct 2017 13:32:44 -0400 Subject: [PATCH 080/174] Minor fix to _m --- .../chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh index 0c306dfa05c..26fdaec4334 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh @@ -85,7 +85,7 @@ fi if false && [ $stage -le 1 ]; then echo "$0: chain training on the supervised subset data/${supervised_set}" - local/chain/run_tdnn_11k.sh $train_supervised_opts --remove-egs false \ + local/chain/run_tdnn_15k.sh $train_supervised_opts --remove-egs false \ --train-set $supervised_set --ivector-train-set $base_train_set \ --nnet3-affix $nnet3_affix --tdnn-affix $tdnn_affix --exp $exp fi @@ -238,6 +238,8 @@ if [ -z "$sup_egs_dir" ]; then utils/create_split_dir.pl \ /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage fi + + mkdir -p $sup_egs_dir touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
echo "$0: generating egs from the supervised data" @@ -270,6 +272,8 @@ if [ -z "$unsup_egs_dir" ]; then utils/create_split_dir.pl \ /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage fi + + mkdir -p $unsup_egs_dir touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. echo "$0: generating egs from the unsupervised data" From 5bfdd394bbe86ae123390bb509f05fd516c8e5a5 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sun, 22 Oct 2017 14:51:53 -0400 Subject: [PATCH 081/174] Tolerance fst fixed --- src/chain/chain-supervision-splitter.cc | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/chain/chain-supervision-splitter.cc b/src/chain/chain-supervision-splitter.cc index 5fff6269835..4211a17645a 100644 --- a/src/chain/chain-supervision-splitter.cc +++ b/src/chain/chain-supervision-splitter.cc @@ -641,18 +641,25 @@ void ToleranceEnforcerFstCreator::AddArcsBetweenOffsets( if (offset == 0) { if (forward_id == 0) { + // Add arc from start state to the offset 0 initial state. + // This is the normal case when there is no partial phone in the lattice. StateId init_state = GetStateId(offset, forward_id, kInit); - fst_->AddArc(0, - fst::StdArc(0, 0, - fst::TropicalWeight::One(), - init_state)); + fst_->AddArc(0, fst::StdArc(0, 0, + fst::TropicalWeight::One(), + init_state)); } - StateId next_state = GetStateId(offset, forward_id, kAccept); - fst_->AddArc(0, - fst::StdArc(0, 0, - fst::TropicalWeight::One(), - next_state)); + // Add self-loop on start state accepting the self-loop transition + // of a partial phone. + fst_->AddArc(0, fst::StdArc(self_loop_tid, self_loop_pdf + 1, + fst::TropicalWeight::One(), + 0)); + + // Add arc from start state deleting a self-loop transition + StateId next_state = GetStateId(offset - 1, forward_id, kDeletion); + fst_->AddArc(0, fst::StdArc(self_loop_tid, 0, + fst::TropicalWeight::One(), + next_state)); } } From 479e769e073965834fcaa25b84ff3c3248aafe85 Mon Sep 17 00:00:00 2001 From: System User Date: Fri, 27 Oct 2017 12:20:19 -0400 Subject: [PATCH 082/174] semisup: Fixing some bugs and making cleaner scripts --- .../run_tdnn_11k_semisupervised_conf_m.sh | 3 +- .../run_tdnn_11k_semisupervised_conf_n.sh | 5 +- .../run_tdnn_15k_semisupervised_conf_w.sh | 428 ++++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_x.sh | 427 +++++++++++++++++ .../run_tdnn_50k_semisupervised_conf_f.sh | 23 +- .../run_tdnn_50k_semisupervised_conf_g.sh | 52 ++- .../nnet3/train/chain_objf/acoustic_model.py | 6 +- egs/wsj/s5/steps/nnet2/remove_egs.sh | 2 +- egs/wsj/s5/steps/nnet3/chain/train.py | 3 +- src/chain/chain-supervision-splitter.cc | 16 +- src/chain/chain-supervision-test.cc | 8 +- src/chain/chain-supervision.cc | 4 +- src/latbin/lattice-determinize-non-compact.cc | 95 ---- 13 files changed, 923 insertions(+), 149 deletions(-) create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_w.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_x.sh diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh index f10fdd3ddc6..6c3fd38deff 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_m.sh @@ -19,6 
+19,7 @@ exp=exp/semisup_15k unsupervised_set=train_unsup250k # set this to your choice of unsupervised data supervised_set=train_sup15k semi_affix=semi15k_250k # affix relating train-set splitting proportion +apply_deriv_weights=true tdnn_affix=7b # affix for the supervised chain-model directory train_supervised_opts="--stage -10 --train-stage -10" @@ -318,7 +319,7 @@ if [ $stage -le 15 ]; then --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient 0.1 \ --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights true \ + --chain.apply-deriv-weights $apply_deriv_weights \ --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width 150 \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh index 42fa4e68283..dab7eb692d8 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh @@ -43,6 +43,7 @@ tree_affix= unsup_egs_opts= apply_deriv_weights=true +do_finetuning=false extra_left_context=0 extra_right_context=0 @@ -241,6 +242,7 @@ if [ -z "$sup_egs_dir" ]; then utils/create_split_dir.pl \ /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage fi + mkdir -p $sup_egs_dir/ touch $sup_egs_dir/.nodelete # keep egs around when that run dies. echo "$0: generating egs from the supervised data" @@ -273,10 +275,11 @@ if [ -z "$unsup_egs_dir" ]; then utils/create_split_dir.pl \ /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage fi + mkdir -p $unsup_egs_dir touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. echo "$0: generating egs from the unsupervised data" - steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ --left-tolerance $tolerance --right-tolerance $tolerance \ --left-context $left_context --right-context $right_context \ --left-context-initial $left_context_initial --right-context-final $right_context_final \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_w.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_w.sh new file mode 100644 index 00000000000..079575bdcc8 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_w.sh @@ -0,0 +1,428 @@ +#!/bin/bash + +# This script is same as _n, but uses trigram for decoding. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 3gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1w # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
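# Worked example of the context arithmetic above (the numbers are assumed defaults,
# not read from $dir/configs/vars): with model_left_context=16, extra_left_context=0
# and frame_subsampling_factor=3,
#   left_context = int(16 + 3 / 2) = 17
# i.e. one extra frame of context to cover the shift introduced by frame subsampling.
model_left_context=16; extra_left_context=0; frame_subsampling_factor=3
left_context=$[model_left_context + extra_left_context]
left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"`
echo $left_context   # prints 17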
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_x.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_x.sh new file mode 100644 index 00000000000..0ff5fef3be7 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_x.sh @@ -0,0 +1,427 @@ +#!/bin/bash + +# This script is same as _m, but uses 3gram LM for decoding +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 3gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion +apply_deriv_weights=true + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs 
that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1x # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
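  # Worked example (using this script's default xent_regularize=0.1, assumed
  # unchanged): learning_rate_factor = 0.5 / 0.1 = 5.0, so the xent output layer
  # trains 5x faster and its effective regularization strength stays roughly
  # constant as xent_regularize is varied.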
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + mkdir -p $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
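# Worked example of how the unsupervised egs directory name above is assembled,
# using the defaults declared at the top of this script (graph_affix=_ex250k,
# lattice_prune_beam=4.0, lattice_lm_scale=0.5, tolerance=1):
unsupervised_set=train_unsup250k_sp; decode_affix=_ex250k
egs_affix=_prun4.0_lmwt0.5_tol1
echo egs_${unsupervised_set}${decode_affix}${egs_affix}
# -> egs_train_unsup250k_sp_ex250k_prun4.0_lmwt0.5_tol1
# so egs produced with different pruning / LM-scale / tolerance settings never collide.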
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh 
index 3af0410dbe1..4915521d72c 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh @@ -23,9 +23,6 @@ semi_affix=semi50k_250k # affix relating train-set splitting proportion tdnn_affix=7b # affix for the supervised chain-model directory train_supervised_opts="--stage -10 --train-stage -10" -lm_opts= -do_finetuning=false - # Unsupervised options decode_affix= egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir @@ -35,7 +32,6 @@ lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting e tolerance=1 graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph phone_insertion_penalty= -unsup_egs_opts= # Semi-supervised options comb_affix=comb1f # affix for new chain-model directory trained on the combined supervised+unsupervised subsets @@ -44,8 +40,11 @@ lm_weights=3,2 sup_egs_dir= unsup_egs_dir= comb_egs_dir= -apply_deriv_weights=true tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false extra_left_context=0 extra_right_context=0 @@ -76,6 +75,7 @@ echo "$0 $@" # Print the command line for logging nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs decode_affix=${decode_affix}${graph_affix} egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= RANDOM=0 @@ -158,18 +158,15 @@ cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 sup_ali_dir=$exp/tri4a treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} -if [ $stage -le 9 ]; then - if [ -f $treedir/final.mdl ]; then - echo "$0: $treedir/final.mdl already exists. Remove it and try again." - exit 1 - fi +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 fi dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} if [ $stage -le 10 ]; then steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ - --lm-opts "--num-extra-lm-states=2000" \ ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ $dir fi @@ -247,6 +244,7 @@ if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then utils/create_split_dir.pl \ /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage fi + mkdir -p $sup_egs_dir/ touch $sup_egs_dir/.nodelete # keep egs around when that run dies. echo "$0: generating egs from the supervised data" @@ -281,10 +279,11 @@ if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then utils/create_split_dir.pl \ /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage fi + mkdir -p $unsup_egs_dir touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
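# A sketch of the denominator-LM weighting used earlier in this script (an
# interpretation based on the option names; lm_weights=3,2 is this script's default):
# phone sequences from the supervised alignment/tree dir are counted 3 times and
# those from the unsupervised best path 2 times when the denominator phone LM is
# estimated, biasing it towards the more reliable supervised transcripts.
#   steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats 3,2 --cmd "$train_cmd" \
#     $treedir $chaindir/best_path_${unsupervised_set}${decode_affix} $dir
# (after this patch, --num-extra-lm-states is passed to train.py via --chain.lm-opts
# rather than to this script.)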
echo "$0: generating egs from the unsupervised data" - steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ --left-tolerance $tolerance --right-tolerance $tolerance \ --left-context $left_context --right-context $right_context \ --left-context-initial $left_context_initial --right-context-final $right_context_final \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_g.sh index 3679842c877..3847e02725c 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_g.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_g.sh @@ -23,9 +23,6 @@ semi_affix=semi50k_250k # affix relating train-set splitting proportion tdnn_affix=7b # affix for the supervised chain-model directory train_supervised_opts="--stage -10 --train-stage -10" -lm_opts= -do_finetuning=false - # Unsupervised options decode_affix= egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir @@ -35,7 +32,6 @@ lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting e tolerance=1 graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph phone_insertion_penalty= -unsup_egs_opts= # Semi-supervised options comb_affix=comb1g # affix for new chain-model directory trained on the combined supervised+unsupervised subsets @@ -43,7 +39,12 @@ supervision_weights=1.0,1.0 lm_weights=3,2 sup_egs_dir= unsup_egs_dir= +comb_egs_dir= tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false extra_left_context=0 extra_right_context=0 @@ -74,6 +75,7 @@ echo "$0 $@" # Print the command line for logging nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs decode_affix=${decode_affix}${graph_affix} egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= RANDOM=0 @@ -156,18 +158,15 @@ cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 sup_ali_dir=$exp/tri4a treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} -if [ $stage -le 9 ]; then - if [ -f $treedir/final.mdl ]; then - echo "$0: $treedir/final.mdl already exists. Remove it and try again." - exit 1 - fi +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 fi dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} if [ $stage -le 10 ]; then steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ - --lm-opts "--num-extra-lm-states=2000" \ ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ $dir fi @@ -236,7 +235,7 @@ right_context_final=`perl -e "print int($right_context_final + $frame_subsamplin supervised_set=${supervised_set}_sp sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats -if [ -z "$sup_egs_dir" ]; then +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then sup_egs_dir=$dir/egs_${supervised_set} frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) @@ -245,6 +244,7 @@ if [ -z "$sup_egs_dir" ]; then utils/create_split_dir.pl \ /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage fi + mkdir -p $sup_egs_dir/ touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
echo "$0: generating egs from the supervised data" @@ -262,14 +262,16 @@ if [ -z "$sup_egs_dir" ]; then $sup_lat_dir $sup_egs_dir fi else - frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi fi unsupervised_set=${unsupervised_set}_sp unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} -[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg -if [ -z "$unsup_egs_dir" ]; then +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} if [ $stage -le 13 ]; then @@ -277,10 +279,11 @@ if [ -z "$unsup_egs_dir" ]; then utils/create_split_dir.pl \ /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage fi + mkdir -p $unsup_egs_dir touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. echo "$0: generating egs from the unsupervised data" - steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ --left-tolerance $tolerance --right-tolerance $tolerance \ --left-context $left_context --right-context $right_context \ --left-context-initial $left_context_initial --right-context-final $right_context_final \ @@ -297,17 +300,18 @@ if [ -z "$unsup_egs_dir" ]; then fi fi -comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi -if [ $stage -le 14 ]; then - steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ - --minibatch-size 128 --frames-per-iter 1500000 \ - --lang2weight $supervision_weights --egs-prefix cegs. 2 \ - $sup_egs_dir $unsup_egs_dir $comb_egs_dir - touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
+ fi fi - if [ $train_stage -le -4 ]; then train_stage=-4 fi @@ -321,7 +325,7 @@ if [ $stage -le 15 ]; then --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient 0.1 \ --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights true \ + --chain.apply-deriv-weights $apply_deriv_weights \ --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width 150 \ diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index b415a44ea16..97bd20b9ffd 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -384,9 +384,11 @@ def train_one_iteration(dir, iter, srand, egs_dir, os.remove("{0}/cache.{1}".format(dir, iter)) -def check_for_required_files(feat_dir, tree_dir, lat_dir): +def check_for_required_files(feat_dir, tree_dir, lat_dir=None): files = ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), - '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), + '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir)] + if lat_dir is not None: + files += [ '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), '{0}/num_jobs'.format(lat_dir)] for file in files: diff --git a/egs/wsj/s5/steps/nnet2/remove_egs.sh b/egs/wsj/s5/steps/nnet2/remove_egs.sh index 143a5d0d86a..f8e37d86a11 100755 --- a/egs/wsj/s5/steps/nnet2/remove_egs.sh +++ b/egs/wsj/s5/steps/nnet2/remove_egs.sh @@ -35,7 +35,7 @@ fi -for f in $egs/egs.*.ark $egs/degs.*.ark $egs/cegs.*.ark; do +for f in $egs/egs.*.ark $egs/degs.*.ark $egs/cegs{,_orig}.*.ark; do if [ -L $f ]; then rm $(dirname $f)/$(readlink $f) # this will print a warning if it fails. fi diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 9d94bebe4f7..1a4ddb477ec 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -324,7 +324,8 @@ def train(args, run_opts): # Check files chain_lib.check_for_required_files(args.feat_dir, args.tree_dir, - args.lat_dir) + args.lat_dir if args.egs_dir is None + else None) # Set some variables. 
num_jobs = common_lib.get_number_of_jobs(args.tree_dir) diff --git a/src/chain/chain-supervision-splitter.cc b/src/chain/chain-supervision-splitter.cc index 4211a17645a..d5c7c118c97 100644 --- a/src/chain/chain-supervision-splitter.cc +++ b/src/chain/chain-supervision-splitter.cc @@ -110,6 +110,7 @@ bool SupervisionLatticeSplitter::GetFrameRangeSupervision( *out_lat = lat_out; } + // Apply lm-scale on the lattice and remove the acoustic costs ScaleLattice(fst::LatticeScale(sup_opts_.lm_scale, 0.0), &lat_out); supervision->frames_per_sequence = num_frames; @@ -655,11 +656,13 @@ void ToleranceEnforcerFstCreator::AddArcsBetweenOffsets( fst::TropicalWeight::One(), 0)); - // Add arc from start state deleting a self-loop transition - StateId next_state = GetStateId(offset - 1, forward_id, kDeletion); - fst_->AddArc(0, fst::StdArc(self_loop_tid, 0, - fst::TropicalWeight::One(), - next_state)); + if (offset > -opts_.left_tolerance) { + // Add arc from start state deleting a self-loop transition + StateId next_state = GetStateId(offset - 1, forward_id, kDeletion); + fst_->AddArc(0, fst::StdArc(self_loop_tid, 0, + fst::TropicalWeight::One(), + next_state)); + } } } @@ -673,7 +676,6 @@ void ToleranceEnforcerFstCreator::AddArcsForOffset(int32 offset) { forward_id++; } } - } void ToleranceEnforcerFstCreator::MakeFst() { @@ -689,6 +691,8 @@ void ToleranceEnforcerFstCreator::MakeFst() { AddArcsForOffset(o); } + if (GetVerboseLevel() > 3) { WriteFstKaldi(std::cerr, false, *fst_); } + fst::Connect(fst_); fst::ArcSort(fst_, fst::ILabelCompare()); } diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index 1d055ab96ec..27c9ccaf438 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -389,9 +389,9 @@ void ChainSmbrTrainingTest(const DenominatorGraph &den_graph, opts.use_smbr_objective = true; opts.mmi_factor = 0.0; opts.smbr_factor = 1.0; - BaseFloat objf, l2_term, weight; + BaseFloat objf, mmi_objf = 0.0, l2_term, weight; ComputeChainSmbrObjfAndDeriv(opts, den_graph, supervision, - nnet_output, &objf, &l2_term, &weight, + nnet_output, &objf, &mmi_objf, &l2_term, &weight, &nnet_output_deriv); { @@ -426,11 +426,11 @@ void ChainSmbrTrainingTest(const DenominatorGraph &den_graph, CuMatrix nnet_output_perturbed(nnet_delta_output); nnet_output_perturbed.AddMat(1.0, nnet_output); - BaseFloat objf_modified, l2_term_modified, weight_modified; + BaseFloat objf_modified, mmi_objf_modified, l2_term_modified, weight_modified; ComputeChainSmbrObjfAndDeriv(opts, den_graph, supervision, nnet_output_perturbed, - &objf_modified, &l2_term_modified, + &objf_modified, &mmi_objf_modified, &l2_term_modified, &weight_modified, NULL); diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 42d4a57bbdf..4f52f1083f0 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -76,13 +76,13 @@ void ProtoSupervision::Write(std::ostream &os, bool binary) const { void SupervisionOptions::Check() const { KALDI_ASSERT(left_tolerance >= 0 && right_tolerance >= 0 && frame_subsampling_factor > 0 && - left_tolerance + right_tolerance >= frame_subsampling_factor); + left_tolerance + right_tolerance + 1 >= frame_subsampling_factor); KALDI_ASSERT(lm_scale >= 0.0 && lm_scale < 1.0); if (!silence_phones_str.empty()) { KALDI_ASSERT(left_tolerance_silence >= 0 && right_tolerance_silence >= 0 && - left_tolerance_silence + right_tolerance_silence >= frame_subsampling_factor); + left_tolerance_silence + right_tolerance_silence + 1 
>= frame_subsampling_factor); } } diff --git a/src/latbin/lattice-determinize-non-compact.cc b/src/latbin/lattice-determinize-non-compact.cc index 44ae8566f86..cf73e22980d 100644 --- a/src/latbin/lattice-determinize-non-compact.cc +++ b/src/latbin/lattice-determinize-non-compact.cc @@ -90,101 +90,6 @@ bool DeterminizeLatticeWrapper(const Lattice &lat, return false; } -void ComputeAcousticScoresMap( - const Lattice &lat, - unordered_map, std::pair, - PairHasher > *acoustic_scores) { - acoustic_scores->clear(); - - std::vector state_times; - LatticeStateTimes(lat, &state_times); - - KALDI_ASSERT(lat.Start() == 0); - - for (StateId s = 0; s < lat.NumStates(); s++) { - int32 t = state_times[s]; - for (fst::ArcIterator aiter(lat, s); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - const LatticeWeight &weight = arc.weight; - - int32 tid = arc.ilabel; - - if (tid != 0) { - unordered_map, std::pair, - PairHasher >::iterator it = acoustic_scores->find(std::make_pair(t, tid)); - if (it == acoustic_scores->end()) { - acoustic_scores->insert(std::make_pair(std::make_pair(t, tid), - std::make_pair(weight.Value2(), 1))); - } else { - if (it->second.second == 2 - && it->second.first / it->second.second != weight.Value2()) { - KALDI_VLOG(2) << "Transitions on the same frame have different " - << "acoustic costs for tid " << tid << "; " - << it->second.first / it->second.second - << " vs " << weight.Value2(); - } - it->second.first += weight.Value2(); - it->second.second++; - } - } else { - // Arcs with epsilon input label (tid) must have 0 acoustic cost - KALDI_ASSERT(weight.Value2() == 0); - } - } - - LatticeWeight f = lat.Final(s); - if (f != LatticeWeight::Zero()) { - // Final acoustic cost must be 0 as we are reading from - // non-determinized, non-compact lattice - KALDI_ASSERT(f.Value2() == 0.0); - } - } -} - -void ReplaceAcousticScoresFromMap( - const unordered_map, std::pair, - PairHasher > &acoustic_scores, - Lattice *lat) { - fst::TopSort(lat); - - std::vector state_times; - LatticeStateTimes(*lat, &state_times); - - KALDI_ASSERT(lat->Start() == 0); - - for (StateId s = 0; s < lat->NumStates(); s++) { - int32 t = state_times[s]; - for (fst::MutableArcIterator aiter(lat, s); - !aiter.Done(); aiter.Next()) { - Arc arc(aiter.Value()); - - int32 tid = arc.ilabel; - if (tid != 0) { - unordered_map, std::pair, - PairHasher >::const_iterator it = acoustic_scores.find(std::make_pair(t, tid)); - if (it == acoustic_scores.end()) { - KALDI_ERR << "Could not find tid " << tid << " at time " << t - << " in the acoustic scores map."; - } else { - arc.weight.SetValue2(it->second.first / it->second.second); - } - } else { - // For epsilon arcs, set acoustic cost to 0.0 - arc.weight.SetValue2(0.0); - } - aiter.SetValue(arc); - } - - LatticeWeight f = lat->Final(s); - if (f != LatticeWeight::Zero()) { - // Set final acoustic cost to 0.0 - f.SetValue2(0.0); - lat->SetFinal(s, f); - } - } -} - } int main(int argc, char *argv[]) { From a3c3703a7e244c32b475583754711cf1b836ca9c Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 27 Oct 2017 12:39:40 -0400 Subject: [PATCH 083/174] minor changes --- .../semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh | 2 +- .../semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_g.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh index 
3af0410dbe1..6c9ae27d392 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# This script is same as _g, but split lattice supervision # unsup_frames_per_eg=150 # Deriv weights: Lattice posterior of best path pdf # Unsupervised weight: 1.0 diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_g.sh index 3679842c877..42906a95d2a 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_g.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_g.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# This script is same as _e, but uses tree from supervised set. # unsup_frames_per_eg=150 # Deriv weights: Lattice posterior of best path pdf # Unsupervised weight: 1.0 From bf10730b84df864c0bcaf31cbda66228be11815b Mon Sep 17 00:00:00 2001 From: System User Date: Fri, 27 Oct 2017 12:40:31 -0400 Subject: [PATCH 084/174] semisup: Changes to get_egs --- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 120 +++++++++--------- egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh | 107 +++++++++------- 2 files changed, 121 insertions(+), 106 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index ba89c94f29f..20fcd5db802 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -55,9 +55,9 @@ left_tolerance_silence= transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms stage=0 -nj=15 # This should be set to the maximum number of jobs you are - # comfortable to run in parallel; you can increase it if your disk - # speed is greater and you have more machines. +max_jobs_run=15 # This should be set to the maximum number of jobs you are + # comfortable to run in parallel; you can increase it if your disk + # speed is greater and you have more machines. max_shuffle_jobs_run=50 # the shuffle jobs now include the nnet3-chain-normalize-egs command, # which is fairly CPU intensive, so we can run quite a few at once # without overloading the disks. @@ -78,8 +78,6 @@ phone_insertion_penalty= deriv_weights_scp= generate_egs_scp=false no_chunking=false -use_mbr_decode=false -arc_scale=1.0 lat_copy_src= echo "$0 $@" # Print the command line for logging @@ -98,7 +96,7 @@ if [ $# != 4 ]; then echo "" echo "Main options (for others, see top of script file)" echo " --config # config file containing options" - echo " --nj # The maximum number of jobs you want to run in" + echo " --max-jobs-run # The maximum number of jobs you want to run in" echo " # parallel (increase this only if you have good disk and" echo " # network speed). default=6" echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." 
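# Sketch of the job-count convention after this change (an interpretation of the
# hunks in this patch; the path is illustrative): the number of parallel egs jobs
# now follows the lattice directory rather than being a free --nj option, and
# --max-jobs-run only throttles how many of those jobs run at the same time.
latdir=exp/chain_semi15k_250k/tdnn_7b/decode_train_unsup250k_sp_ex250k   # illustrative
nj=$(cat $latdir/num_jobs) || exit 1    # one egs job per lattice archive lat.N.gz
max_jobs_run=15                         # later passed to $cmd as --max-jobs-run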
@@ -112,7 +110,7 @@ if [ $# != 4 ]; then echo " --left-context-initial # If >= 0, left-context for first chunk of an utterance" echo " --right-context-final # If >= 0, right-context for last chunk of an utterance" echo " --num-egs-diagnostic <#frames;4000> # Number of egs used in computing (train,valid) diagnostics" - echo " --num-valid-egs-combine <#frames;10000> # Number of egss used in getting combination weights at the" + echo " --num-valid-egs-combine <#frames;10000> # Number of egs used in getting combination weights at the" echo " # very end." echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." @@ -136,13 +134,13 @@ for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done +nj=$(cat $latdir/num_jobs) || exit 1 + sdata=$data/split$nj utils/split_data.sh $data $nj mkdir -p $dir/log $dir/info -num_lat_jobs=$(cat $latdir/num_jobs) || exit 1; - # Get list of validation utterances. frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 @@ -293,20 +291,6 @@ if [ -e $dir/storage ]; then done fi -if [ $stage -le 2 ]; then - echo "$0: copying training lattices" - - if [ -z "$lat_copy_src" ]; then - $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ - lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; - - for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp - else - ln -sf `readlink -f $lat_copy_src`/lat.*.{ark,scp} $dir/ - ln -sf `readlink -f $lat_copy_src`/lat.scp $dir/ - fi -fi - egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" [ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" [ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" @@ -332,20 +316,15 @@ if [ ! -z $lattice_prune_beam ]; then fi fi -if ! $use_mbr_decode; then - if [ ! -z "$lattice_lm_scale" ]; then - chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" +if [ ! -z "$lattice_lm_scale" ]; then + chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" - normalization_scale=$(perl -e " - if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) { - print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; - exit(1); - } - print (1.0 - $lattice_lm_scale);") - fi -else - chain_supervision_all_opts="$chain_supervision_all_opts --arc-scale=$arc_scale --use-mbr-decode" - lattice_copy_cmd="$lattice_copy_cmd | lattice-scale --acoustic-scale=$acwt ark:- ark:-" + normalization_scale=$(perl -e " + if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) { + print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; + exit(1); + } + print (1.0 - $lattice_lm_scale);") fi [ ! -z $phone_insertion_penalty ] && \ @@ -366,14 +345,28 @@ echo $right_context > $dir/info/right_context echo $left_context_initial > $dir/info/left_context_initial echo $right_context_final > $dir/info/right_context_final -if [ $stage -le 3 ]; then - echo "$0: Getting validation and training subset examples." +if [ $stage -le 2 ]; then + if [ ! -z "$lat_copy_src" ]; then + ln -sf `readlink -f $lat_copy_src`/lat.*.{ark,scp} $dir/ + cat $lat_copy_src/lat.{?,??}.scp > $dir/lat.scp + fi + + echo "$0: Getting validation and training subset examples in background." 
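# Worked example of the scale bookkeeping above (lattice_lm_scale=0.5 is the value
# the semisup recipes in this series pass in; the reading of the two scales is how
# this script uses them, not a general statement about chain training): the lattice
# LM scores enter the supervision with weight lm-scale, so the normalization FST is
# applied with the complementary weight.
lattice_lm_scale=0.5
normalization_scale=$(perl -e "print (1.0 - $lattice_lm_scale);")
echo $normalization_scale   # -> 0.5, later passed as --normalization-scale to nnet3-chain-get-egs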
rm $dir/.error 2>/dev/null - echo "$0: ... extracting validation and training-subset alignments." - # do the filtering just once, as lat.scp may be long. - utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ - <$dir/lat.scp >$dir/lat_special.scp + ( + if [ -z "$lat_copy_src" ]; then + $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \ + lattice-copy --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \ + "ark:gunzip -c $latdir/lat.JOB.gz|" \ + ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1 + + for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp + else + # do the filtering just once, as lat.scp may be long. + utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ + <$dir/lat.scp >$dir/lat_special.scp + fi $cmd $dir/log/create_valid_subset.log \ utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ @@ -382,7 +375,7 @@ if [ $stage -le 3 ]; then ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=$srand \ $egs_opts --normalization-scale=$normalization_scale $chaindir/normalization.fst \ - "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || touch $dir/.error & + "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || exit 1 & $cmd $dir/log/create_train_subset.log \ utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- $lattice_copy_cmd \| \ @@ -390,9 +383,9 @@ if [ $stage -le 3 ]; then $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=$srand \ $egs_opts --normalization-scale=$normalization_scale $chaindir/normalization.fst \ - "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || touch $dir/.error & - wait; - [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || exit 1 & + wait + sleep 5 # wait for file system to sync. echo "... Getting subsets of validation examples for diagnostics and combination." if $generate_egs_scp; then valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.cegs,$dir/valid_diagnostic.scp" @@ -403,17 +396,17 @@ if [ $stage -le 3 ]; then fi $cmd $dir/log/create_valid_subset_combine.log \ nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \ - ark:$dir/valid_combine.cegs || touch $dir/.error & + ark:$dir/valid_combine.cegs || exit 1 & $cmd $dir/log/create_valid_subset_diagnostic.log \ nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \ - $valid_diagnostic_output || touch $dir/.error & + $valid_diagnostic_output || exit 1 & $cmd $dir/log/create_train_subset_combine.log \ nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \ - ark:$dir/train_combine.cegs || touch $dir/.error & + ark:$dir/train_combine.cegs || exit 1 & $cmd $dir/log/create_train_subset_diagnostic.log \ nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ - $train_diagnostic_output || touch $dir/.error & + $train_diagnostic_output || exit 1 & wait sleep 5 # wait for file system to sync. if $generate_egs_scp; then @@ -428,6 +421,7 @@ if [ $stage -le 3 ]; then [ ! 
-s $f ] && echo "No examples in file $f" && exit 1; done rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs + ) || touch $dir/.error & fi if [ $stage -le 4 ]; then @@ -448,9 +442,15 @@ if [ $stage -le 4 ]; then # there can be too many small files to deal with, because the total number of # files is the product of 'nj' by 'num_archives_intermediate', which might be # quite large. - $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ - utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp \| \ - lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- $lattice_copy_cmd \| \ + + lattice_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" + if [ ! -z "$lat_copy_src" ]; then + lattice_rspecifier="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp |" + fi + + $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl \ + "$lattice_rspecifier" $lattice_copy_cmd \| \ chain-get-supervision $chain_supervision_all_opts \ --weight=$egs_weight \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ @@ -480,7 +480,7 @@ if [ $stage -le 5 ]; then $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ nnet3-chain-normalize-egs --normalization-scale=$normalization_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- $output_archive || exit 1; - + if $generate_egs_scp; then #concatenate cegs.JOB.scp in single cegs.scp rm -rf $dir/cegs.scp @@ -511,12 +511,12 @@ if [ $stage -le 5 ]; then nnet3-chain-normalize-egs --normalization-scale=$normalization_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:- \| \ nnet3-chain-copy-egs ark:- $output_archives || exit 1; - + if $generate_egs_scp; then #concatenate cegs.JOB.scp in single cegs.scp rm -rf $dir/cegs.scp for j in $(seq $num_archives_intermediate); do - for y in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do cat $dir/cegs.$j.$y.scp || exit 1; done done > $dir/cegs.scp || exit 1; @@ -525,6 +525,9 @@ if [ $stage -le 5 ]; then fi fi +wait +[ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + if [ $stage -le 6 ]; then echo "$0: removing temporary archives" ( @@ -538,8 +541,9 @@ if [ $stage -le 6 ]; then # there are some extra soft links that we should delete. for f in $dir/cegs.*.*.ark; do rm $f; done fi - echo "$0: removing temporary lattices" - rm $dir/lat.* + if [ -z "$lat_copy_src" ]; then + rm $dir/lat_special.*.ark + fi echo "$0: removing temporary alignments and transforms" # Ignore errors below because trans.* might not exist. rm $dir/{ali,trans}.{ark,scp} 2>/dev/null diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh index e507c7467c0..786db3fe31f 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh @@ -55,9 +55,9 @@ left_tolerance_silence= transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms stage=0 -nj=15 # This should be set to the maximum number of jobs you are - # comfortable to run in parallel; you can increase it if your disk - # speed is greater and you have more machines. 
+max_jobs_run=15 # This should be set to the maximum number of jobs you are + # comfortable to run in parallel; you can increase it if your disk + # speed is greater and you have more machines. max_shuffle_jobs_run=50 # the shuffle jobs now include the nnet3-chain-normalize-egs command, # which is fairly CPU intensive, so we can run quite a few at once # without overloading the disks. @@ -95,7 +95,7 @@ if [ $# != 4 ]; then echo "" echo "Main options (for others, see top of script file)" echo " --config # config file containing options" - echo " --nj # The maximum number of jobs you want to run in" + echo " --max-jobs-run # The maximum number of jobs you want to run in" echo " # parallel (increase this only if you have good disk and" echo " # network speed). default=6" echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." @@ -109,7 +109,7 @@ if [ $# != 4 ]; then echo " --left-context-initial # If >= 0, left-context for first chunk of an utterance" echo " --right-context-final # If >= 0, right-context for last chunk of an utterance" echo " --num-egs-diagnostic <#frames;4000> # Number of egs used in computing (train,valid) diagnostics" - echo " --num-valid-egs-combine <#frames;10000> # Number of egss used in getting combination weights at the" + echo " --num-valid-egs-combine <#frames;10000> # Number of egs used in getting combination weights at the" echo " # very end." echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." @@ -131,13 +131,13 @@ for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done +nj=$(cat $latdir/num_jobs) || exit 1 + sdata=$data/split$nj utils/split_data.sh $data $nj mkdir -p $dir/log $dir/info -num_lat_jobs=$(cat $latdir/num_jobs) || exit 1; - # Get list of validation utterances. frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 @@ -274,20 +274,6 @@ if [ -e $dir/storage ]; then done fi -if [ $stage -le 2 ]; then - echo "$0: copying training lattices" - - if [ -z "$lat_copy_src" ]; then - $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ - lattice-copy --write-compact=false "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; - - for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp - else - ln -sf `readlink -f $lat_copy_src`/lat.*.{ark,scp} $dir/ - ln -sf `readlink -f $lat_copy_src`/lat.scp $dir/ - fi -fi - egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" [ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" [ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" @@ -314,12 +300,12 @@ fi if [ ! 
-z "$lattice_lm_scale" ]; then chain_supervision_all_opts="$chain_supervision_all_opts --supervision.lm-scale=$lattice_lm_scale" - + normalization_scale=$(perl -e " - if ($lattice_lm_scale > 1.0 || $lattice_lm_scale < 0) { + if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) { print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; - exit(1); - } + exit(1); + } print (1.0 - $lattice_lm_scale);") fi @@ -343,14 +329,29 @@ echo $right_context > $dir/info/right_context echo $left_context_initial > $dir/info/left_context_initial echo $right_context_final > $dir/info/right_context_final -if [ $stage -le 3 ]; then - echo "$0: Getting validation and training subset examples." +if [ $stage -le 2 ]; then + if [ ! -z "$lat_copy_src" ]; then + ln -sf `readlink -f $lat_copy_src`/lat.*.{ark,scp} $dir/ + cat $lat_copy_src/lat.{?,??}.scp > $dir/lat.scp + fi + + echo "$0: Getting validation and training subset examples in background." rm $dir/.error 2>/dev/null - echo "$0: ... extracting validation and training-subset alignments." - # do the filtering just once, as lat.scp may be long. - utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ - <$dir/lat.scp >$dir/lat_special.scp + ( + if [ -z "$lat_copy_src" ]; then + $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \ + lattice-copy --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \ + --write-compact=false \ + "ark:gunzip -c $latdir/lat.JOB.gz|" \ + ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1 + + for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp + else + # do the filtering just once, as lat.scp may be long. + utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ + <$dir/lat.scp >$dir/lat_special.scp + fi $cmd $dir/log/create_valid_subset.log \ utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ @@ -358,16 +359,16 @@ if [ $stage -le 3 ]; then nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=$srand \ $egs_opts $chaindir/normalization.fst \ "$valid_feats" $chaindir/tree $chaindir/0.trans_mdl \ - ark,s,cs:- "ark:$dir/valid_all.cegs" || touch $dir/.error & + ark,s,cs:- "ark:$dir/valid_all.cegs" || exit 1 & $cmd $dir/log/create_train_subset.log \ utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ - lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- $lattice_copy_cmd \| \ nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=$srand \ $egs_opts $chaindir/normalization.fst \ "$train_subset_feats" $chaindir/tree $chaindir/0.trans_mdl \ - ark,s,cs:- "ark:$dir/train_subset_all.cegs" || touch $dir/.error & - wait; - [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + ark,s,cs:- "ark:$dir/train_subset_all.cegs" || exit 1 & + wait + sleep 5 # wait for file system to sync. echo "... Getting subsets of validation examples for diagnostics and combination." 
if $generate_egs_scp; then valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.cegs,$dir/valid_diagnostic.scp" @@ -378,17 +379,17 @@ if [ $stage -le 3 ]; then fi $cmd $dir/log/create_valid_subset_combine.log \ nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \ - ark:$dir/valid_combine.cegs || touch $dir/.error & + ark:$dir/valid_combine.cegs || exit 1 & $cmd $dir/log/create_valid_subset_diagnostic.log \ nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \ - $valid_diagnostic_output || touch $dir/.error & + $valid_diagnostic_output || exit 1 & $cmd $dir/log/create_train_subset_combine.log \ nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \ - ark:$dir/train_combine.cegs || touch $dir/.error & + ark:$dir/train_combine.cegs || exit 1 & $cmd $dir/log/create_train_subset_diagnostic.log \ nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ - $train_diagnostic_output || touch $dir/.error & + $train_diagnostic_output || exit 1 & wait sleep 5 # wait for file system to sync. if $generate_egs_scp; then @@ -403,6 +404,7 @@ if [ $stage -le 3 ]; then [ ! -s $f ] && echo "No examples in file $f" && exit 1; done rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs + ) || touch $dir/.error & fi if [ $stage -le 4 ]; then @@ -423,9 +425,15 @@ if [ $stage -le 4 ]; then # there can be too many small files to deal with, because the total number of # files is the product of 'nj' by 'num_archives_intermediate', which might be # quite large. - $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ - utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp \| \ - lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + + lattice_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" + if [ ! 
-z "$lat_copy_src" ]; then + lattice_rspecifier="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp |" + fi + + $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \ + lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl \ + "$lattice_rspecifier" $lattice_copy_cmd \| \ nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=\$[JOB+$srand] $egs_opts --supervision.weight=$egs_weight \ --num-frames-overlap=$frames_overlap_per_eg \ "$feats" $chaindir/tree $chaindir/0.trans_mdl \ @@ -453,7 +461,7 @@ if [ $stage -le 5 ]; then $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ nnet3-chain-normalize-egs --normalization-scale=$normalization_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- $output_archive || exit 1; - + if $generate_egs_scp; then #concatenate cegs.JOB.scp in single cegs.scp rm -rf $dir/cegs.scp @@ -484,7 +492,7 @@ if [ $stage -le 5 ]; then nnet3-chain-normalize-egs --normalization-scale=$normalization_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:- \| \ nnet3-chain-copy-egs ark:- $output_archives || exit 1; - + if $generate_egs_scp; then #concatenate cegs.JOB.scp in single cegs.scp rm -rf $dir/cegs.scp @@ -498,6 +506,9 @@ if [ $stage -le 5 ]; then fi fi +wait +[ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + if [ $stage -le 6 ]; then echo "$0: removing temporary archives" ( @@ -511,8 +522,9 @@ if [ $stage -le 6 ]; then # there are some extra soft links that we should delete. for f in $dir/cegs.*.*.ark; do rm $f; done fi - echo "$0: removing temporary lattices" - rm $dir/lat.* + if [ -z "$lat_copy_src" ]; then + rm $dir/lat_special.*.ark + fi echo "$0: removing temporary alignments and transforms" # Ignore errors below because trans.* might not exist. rm $dir/{ali,trans}.{ark,scp} 2>/dev/null @@ -520,4 +532,3 @@ if [ $stage -le 6 ]; then fi echo "$0: Finished preparing training examples" - From 0bbd2ce71035de2872b2ec9af7813e0dd349838d Mon Sep 17 00:00:00 2001 From: System User Date: Sun, 29 Oct 2017 12:20:53 -0400 Subject: [PATCH 085/174] semisup: Adding 100k experiments --- .../semisup/chain/tuning/run_tdnn_100k.sh | 194 ++++++++ .../chain/tuning/run_tdnn_15k_best_path_c.sh | 453 ++++++++++++++++++ .../s5/local/semisup/run_100k.sh | 89 ++++ egs/wsj/s5/steps/best_path_weights.sh | 6 + 4 files changed, 742 insertions(+) create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_c.sh create mode 100644 egs/fisher_english/s5/local/semisup/run_100k.sh diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k.sh new file mode 100644 index 00000000000..904ed588bd3 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k.sh @@ -0,0 +1,194 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 10 hours. 
+ +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix= +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
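+
+  # Note: --generate-egs-scp true (in --egs.opts below) makes get_egs.sh write
+  # cegs.scp files alongside the archives; the semi-supervised recipes in this
+  # series pass such egs directories to steps/nnet3/multilingual/combine_egs.sh
+  # when combining supervised and unsupervised examples.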
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_c.sh new file mode 100755 index 00000000000..e33331cc428 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_c.sh @@ -0,0 +1,453 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. 
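+#
+# ("best path" here means the unsupervised data is supervised by its decoded
+# best path: egs are built from the 1-best lattices in 1best_lats_* and the
+# per-frame deriv weights come from the best-path posteriors in
+# best_path_*/weights.scp -- see the stages below.)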
+# unsup_frames_per_eg=150
+# Deriv weights: Lattice posterior of best path pdf
+# Unsupervised weight: 1.0
+# Weights for phone LM (supervised, unsupervised): 5,2
+# LM for decoding unsupervised data: 4gram
+
+set -u -e -o pipefail
+
+stage=-2
+train_stage=-100
+nj=40
+decode_nj=40
+base_train_set=semisup15k_250k # for reference
+exp=exp/semisup_15k
+
+unsupervised_set=train_unsup250k # set this to your choice of unsupervised data
+supervised_set=train_sup15k
+semi_affix=semi15k_250k # affix relating train-set splitting proportion
+
+tdnn_affix=7b # affix for the supervised chain-model directory
+train_supervised_opts="--stage -10 --train-stage -10"
+
+# Unsupervised options
+decode_affix=
+egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir
+unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly
+lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data
+tolerance=1
+graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph
+phone_insertion_penalty=
+
+# Semi-supervised options
+comb_affix=best_path_comb1c # affix for new chain-model directory trained on the combined supervised+unsupervised subsets
+supervision_weights=1.0,1.0
+lm_weights=5,2
+sup_egs_dir=
+unsup_egs_dir=
+tree_affix=
+unsup_egs_opts=
+apply_deriv_weights=true
+
+
+extra_left_context=0
+extra_right_context=0
+
+xent_regularize=0.1
+hidden_dim=725
+minibatch_size="150=128/300=64"
+# to tune:
+# frames_per_eg for unsupervised
+
+decode_iter=
+lang_test_suffix=
+
+finetune_stage=-2
+finetune_suffix=_finetune
+finetune_iter=final
+num_epochs_finetune=1
+finetune_xent_regularize=0.1
+finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05"
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs
+decode_affix=${decode_affix}${graph_affix}
+egs_affix=${egs_affix}_tol${tolerance}
+tree_affix=
+
+RANDOM=0
+
+if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/1best_lats_${unsupervised_set}${decode_affix} + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
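+
+    # Note: unlike the supervised egs above, these use
+    # --alignment-subsampling-factor 1 (the decode lattices are presumably
+    # already at the frame-subsampled rate), per-frame deriv weights from the
+    # best-path posteriors, and left/right tolerances of $tolerance frames.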
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/run_100k.sh b/egs/fisher_english/s5/local/semisup/run_100k.sh new file mode 100644 index 00000000000..4a023b627dc --- /dev/null +++ 
b/egs/fisher_english/s5/local/semisup/run_100k.sh @@ -0,0 +1,89 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +. cmd.sh +. path.sh + +stage=-1 +train_stage=-10 + +. utils/parse_options.sh + +set -o pipefail +exp=exp/semisup_100k + +true && { +utils/subset_data_dir.sh --shortest data/train_sup 100000 data/train_sup_100kshort +utils/subset_data_dir.sh data/train_sup_100kshort 10000 data/train_sup_10k +utils/data/remove_dup_utts.sh 100 data/train_sup_10k data/train_sup_10k_nodup +utils/subset_data_dir.sh --speakers data/train_sup 30000 data/train_sup_30k + +steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup_10k_nodup data/lang $exp/mono0a || exit 1 + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup_30k data/lang $exp/mono0a $exp/mono0a_ali || exit 1 + +steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_sup_30k data/lang $exp/mono0a_ali $exp/tri1 || exit 1 + +(utils/mkgraph.sh data/lang_test $exp/tri1 $exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri1/graph data/dev $exp/tri1/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup_30k data/lang $exp/tri1 $exp/tri1_ali || exit 1; + +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 20000 data/train_sup_30k data/lang $exp/tri1_ali $exp/tri2 || exit 1; + +(utils/mkgraph.sh data/lang_test $exp/tri2 $exp/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri2/graph data/dev $exp/tri2/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup data/lang $exp/tri2 $exp/tri2_ali || exit 1; + +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + 5000 40000 data/train_sup data/lang $exp/tri2_ali $exp/tri3a || exit 1; + +( + utils/mkgraph.sh data/lang_test $exp/tri3a $exp/tri3a/graph || exit 1; + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri3a/graph data/dev $exp/tri3a/decode_dev || exit 1; +)& + +steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup data/lang $exp/tri3a $exp/tri3a_ali || exit 1; + +steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/train_sup data/lang $exp/tri3a_ali $exp/tri4a || exit 1; + +( + utils/mkgraph.sh data/lang_test $exp/tri4a $exp/tri4a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri4a/graph data/dev $exp/tri4a/decode_dev +)& + +utils/copy_data_dir.sh data/train_unsup250k data/train_unsup100k_250k +utils/combine_data.sh data/semisup100k_250k data/train_sup \ + data/train_unsup250k || exit 1 + +local/semisup/chain/tuning/run_tdnn_100k.sh \ + --train-set train_sup \ + --stage $stage --train-stage $train_stage \ + --exp $exp \ + --ivector-train-set train_sup || exit 1 +} + +false && local/semisup/chain/tuning/run_tdnn_oracle.sh \ + --train-set semisup15k_250k \ + --nnet3-affix _semi15k_250k \ + --chain-affix _semi15k_250k_oracle \ + --stage 9 --train-stage $train_stage \ + --exp $exp \ + --ivector-train-set semisup15k_250k || exit 1 + diff --git a/egs/wsj/s5/steps/best_path_weights.sh b/egs/wsj/s5/steps/best_path_weights.sh index b70acae0362..8e10298a4f3 100755 --- a/egs/wsj/s5/steps/best_path_weights.sh +++ b/egs/wsj/s5/steps/best_path_weights.sh @@ -164,4 +164,10 @@ for n in `seq 1 $[num_sys-1]`; do rm $dir/weights.$n.*.ark $dir/weights.$n.*.scp done +if $write_words; then + for n in `seq $nj`; do + cat $dir/words.$n.txt + done > $dir/words.txt +fi + exit 0 From 
f392d741b48b4b6e06198ce17a8fb04601654cbc Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 30 Oct 2017 12:26:16 -0400 Subject: [PATCH 086/174] Changed permissions --- egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k.sh diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k.sh old mode 100644 new mode 100755 From 05ba2d9ceca883de7be8f765a0907268f0ad221e Mon Sep 17 00:00:00 2001 From: System User Date: Thu, 2 Nov 2017 15:21:26 -0400 Subject: [PATCH 087/174] Binaries for undeterminized lattices --- src/chain/chain-supervision.cc | 8 +- src/latbin/Makefile | 2 +- src/latbin/lattice-interp.cc | 32 ++++++- src/latbin/lattice-lmrescore-const-arpa.cc | 89 ++++++++++++++++--- src/latbin/lattice-lmrescore.cc | 99 ++++++++++++++++------ src/latbin/lattice-word-align.cc | 2 +- 6 files changed, 188 insertions(+), 44 deletions(-) diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 4f52f1083f0..6eab1059707 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -847,8 +847,10 @@ bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, fst::StdVectorFst supervision_fst_noeps(supervision->fst); fst::RmEpsilon(&supervision_fst_noeps); if (!TryDeterminizeMinimize(kSupervisionMaxStates, - &supervision_fst_noeps)) + &supervision_fst_noeps)) { + KALDI_WARN << "Failed to determinize supervision fst"; return false; + } // note: by default, 'Compose' will call 'Connect', so if the // resulting FST is not connected, it will end up empty. @@ -861,8 +863,10 @@ bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, // determinize and minimize to make it as compact as possible. if (!TryDeterminizeMinimize(kSupervisionMaxStates, - &composed_fst)) + &composed_fst)) { + KALDI_WARN << "Failed to determinize normalized supervision fst"; return false; + } supervision->fst = composed_fst; // Make sure the states are numbered in increasing order of time. diff --git a/src/latbin/Makefile b/src/latbin/Makefile index fb9b3f5e71d..1560907c103 100644 --- a/src/latbin/Makefile +++ b/src/latbin/Makefile @@ -9,7 +9,7 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \ lattice-determinize lattice-oracle lattice-rmali \ lattice-compose lattice-boost-ali lattice-copy lattice-to-fst \ lattice-to-phone-lattice lattice-interp lattice-project \ - lattice-add-trans-probs lattice-difference lattice-word-align \ + lattice-add-trans-probs lattice-difference \ nbest-to-linear nbest-to-lattice lattice-1best linear-to-nbest \ lattice-mbr-decode lattice-align-words lattice-to-mpe-post \ lattice-copy-backoff nbest-to-ctm lattice-determinize-pruned \ diff --git a/src/latbin/lattice-interp.cc b/src/latbin/lattice-interp.cc index 41e1b32658f..edb5d02be7e 100644 --- a/src/latbin/lattice-interp.cc +++ b/src/latbin/lattice-interp.cc @@ -22,6 +22,7 @@ #include "util/common-utils.h" #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" int main(int argc, char *argv[]) { try { @@ -45,8 +46,10 @@ int main(int argc, char *argv[]) { " e.g.: lattice-compose ark:1.lats ark:2.lats ark:composed.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat alpha = 0.5; // Scale of 1st in the pair. 
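+    // Note: with --write-compact=false the output is written as a (non-compact)
+    // Lattice; the code below first records per-frame acoustic scores from lat1
+    // with ComputeAcousticScoresMap() and restores them on the interpolated
+    // lattice with ReplaceAcousticScoresFromMap() before writing.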
+ po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("alpha", &alpha, "Scale of the first lattice in the pair (should be in range [0, 1])"); po.Read(argc, argv); @@ -62,7 +65,13 @@ int main(int argc, char *argv[]) { SequentialLatticeReader lattice_reader1(lats_rspecifier1); RandomAccessCompactLatticeReader lattice_reader2(lats_rspecifier2); - CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + + if (write_compact) + compact_lattice_writer.Open(lats_wspecifier); + else + lattice_writer.Open(lats_wspecifier); int32 n_processed = 0, n_empty = 0, n_success = 0, n_no_2ndlat=0; @@ -70,9 +79,16 @@ int main(int argc, char *argv[]) { std::string key = lattice_reader1.Key(); Lattice lat1 = lattice_reader1.Value(); lattice_reader1.FreeCurrent(); + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + if (!write_compact) + ComputeAcousticScoresMap(lat1, &acoustic_scores); ScaleLattice(fst::LatticeScale(alpha, alpha), &lat1); + ArcSort(&lat1, fst::OLabelCompare()); + if (lattice_reader2.HasKey(key)) { n_processed++; CompactLattice clat2 = lattice_reader2.Value(key); @@ -91,9 +107,17 @@ int main(int argc, char *argv[]) { n_empty++; } else { n_success++; - CompactLattice clat3; - ConvertLattice(lat3, &clat3); - compact_lattice_writer.Write(key, clat3); + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + + if (write_compact) { + CompactLattice clat3; + ConvertLattice(lat3, &clat3); + compact_lattice_writer.Write(key, clat3); + } else { + ReplaceAcousticScoresFromMap(acoustic_scores, &lat3); + lattice_writer.Write(key, lat3); + } } } else { KALDI_WARN << "No lattice found for utterance " << key << " in " diff --git a/src/latbin/lattice-lmrescore-const-arpa.cc b/src/latbin/lattice-lmrescore-const-arpa.cc index bd5f9c16cf7..06da0ba9027 100644 --- a/src/latbin/lattice-lmrescore-const-arpa.cc +++ b/src/latbin/lattice-lmrescore-const-arpa.cc @@ -44,10 +44,13 @@ int main(int argc, char *argv[]) { " const_arpa ark:out.lats\n"; ParseOptions po(usage); - bool write_compact = true; + bool write_compact = true, determinize = true; BaseFloat lm_scale = 1.0; po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); + po.Register("determinize", &determinize, "If false, then the output will retain " + "all the non-deterministic paths in the original lattice, " + "but rescored with LM score from the best path in the LM"); po.Register("lm-scale", &lm_scale, "Scaling factor for language model " "costs; frequently 1.0 or -1.0"); @@ -116,6 +119,16 @@ int main(int argc, char *argv[]) { fst::ScaleLattice(fst::GraphLatticeScale(1.0/lm_scale), &clat); ArcSort(&clat, fst::OLabelCompare()); + // A copy of the lattice is needed if determinize=false + CompactLattice clat_copy; + if (!determinize) { + clat_copy = clat; + // Remove graph scores from the lattice so that the composed lattice + // will not have any graph scores other than the one from the + // new LM. They will be added back later. + fst::ScaleLattice(fst::GraphLatticeScale(0.0), &clat); + } + // Wraps the ConstArpaLm format language model into FST. We re-create it // for each lattice to prevent memory usage increasing with time. 
ConstArpaLmDeterministicFst const_arpa_fst(const_arpa); @@ -131,30 +144,82 @@ int main(int argc, char *argv[]) { Invert(&composed_lat); CompactLattice determinized_clat; DeterminizeLattice(composed_lat, &determinized_clat); - fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &determinized_clat); if (determinized_clat.Start() == fst::kNoStateId) { KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; n_fail++; } else { - if (write_compact) { - compact_lattice_writer.Write(key, determinized_clat); - } else { + if (!determinize) { + RemoveAlignmentsFromCompactLattice(&determinized_clat); + + Lattice lat2; + ConvertLattice(determinized_clat, &lat2); + fst::Project(&lat2, fst::PROJECT_OUTPUT); // project on words + fst::ArcSort(&lat2, fst::ILabelCompare()); + + // Avoid double counting of acoustic scores + fst::ScaleLattice(fst::AcousticLatticeScale(0.0), &lat2); + + Lattice lat1; + ConvertLattice(clat_copy, &lat1); + fst::ArcSort(&lat1, fst::OLabelCompare()); + Lattice out_lat; - fst::ConvertLattice(determinized_clat, &out_lat); - // Replace each arc (t, tid) with the averaged acoustic score from - // the computed map - ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); - lattice_writer.Write(key, out_lat); + // out_lat will have the original acoustic and graph scores + // (after scaling by lm_scale below) along with LM scores + // from lat2 + Compose(lat1, lat2, &out_lat); + + fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &out_lat); + + if (out_lat.Start() == fst::kNoStateId) { // empty composition. + KALDI_WARN << "For utterance " << key << ", composed result is empty."; + n_fail++; + } else { + if (write_compact) { + CompactLattice out_clat; + ConvertLattice(out_lat, &out_clat); + compact_lattice_writer.Write(key, out_clat); + } else { + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); + } + n_done++; + } + } else { + fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &determinized_clat); + if (write_compact) { + compact_lattice_writer.Write(key, determinized_clat); + } else { + Lattice out_lat; + fst::ConvertLattice(determinized_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); + } + n_done++; } - n_done++; } } else { // Zero scale so nothing to do. 
n_done++; - compact_lattice_writer.Write(key, clat); + if (write_compact) { + compact_lattice_writer.Write(key, clat); + } else { + Lattice out_lat; + fst::ConvertLattice(clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); + } } } diff --git a/src/latbin/lattice-lmrescore.cc b/src/latbin/lattice-lmrescore.cc index 03395e68afb..ba7c84dd13c 100644 --- a/src/latbin/lattice-lmrescore.cc +++ b/src/latbin/lattice-lmrescore.cc @@ -1,4 +1,4 @@ -// latbin/lattice-lmrescore.cc +//latbin/lattice-lmrescore.cc // Copyright 2009-2011 Microsoft Corporation // 2014 Johns Hopkins University (author: Daniel Povey) @@ -44,11 +44,14 @@ int main(int argc, char *argv[]) { " e.g.: lattice-lmrescore --lm-scale=-1.0 ark:in.lats 'fstproject --project_output=true data/lang/G.fst|' ark:out.lats\n"; ParseOptions po(usage); - bool write_compact = true; + bool write_compact = true, determinize = true; BaseFloat lm_scale = 1.0; int32 num_states_cache = 50000; po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); + po.Register("determinize", &determinize, "If false, then the output will retain " + "all the non-deterministic paths in the original lattice, " + "but rescored with LM score from the best path in the LM"); po.Register("lm-scale", &lm_scale, "Scaling factor for language model costs; frequently 1.0 or -1.0"); po.Register("num-states-cache", &num_states_cache, "Number of states we cache when mapping LM FST to lattice type. " @@ -102,7 +105,6 @@ int main(int argc, char *argv[]) { // composition and determinization. SequentialLatticeReader lattice_reader(lats_rspecifier); - // Write as compact lattice. CompactLatticeWriter compact_lattice_writer; LatticeWriter lattice_writer; @@ -117,6 +119,13 @@ int main(int argc, char *argv[]) { std::string key = lattice_reader.Key(); Lattice lat = lattice_reader.Value(); lattice_reader.FreeCurrent(); + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + if (!write_compact) + ComputeAcousticScoresMap(lat, &acoustic_scores); + if (lm_scale != 0.0) { // Only need to modify it if LM scale nonzero. // Before composing with the LM FST, we scale the lattice weights @@ -125,14 +134,18 @@ int main(int argc, char *argv[]) { // right effect (taking the "best path" through the LM) regardless // of the sign of lm_scale. fst::ScaleLattice(fst::GraphLatticeScale(1.0 / lm_scale), &lat); - // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) - unordered_map, std::pair, - PairHasher > acoustic_scores; - if (!write_compact) - ComputeAcousticScoresMap(lat, &acoustic_scores); - ArcSort(&lat, fst::OLabelCompare()); + // A copy of the lattice is needed if determinize=false + Lattice lat_copy; + if (!determinize) { + lat_copy = lat; + // Remove graph scores from the lattice so that the composed lattice + // will not have any graph scores other than the one from the + // new LM. They will be added back later. + fst::ScaleLattice(fst::GraphLatticeScale(0.0), &lat); + } + Lattice composed_lat; // Could just do, more simply: Compose(lat, lm_fst, &composed_lat); // and not have lm_compose_cache at all. @@ -142,28 +155,66 @@ int main(int argc, char *argv[]) { Invert(&composed_lat); // make it so word labels are on the input. 
- CompactLattice determinized_lat; - DeterminizeLattice(composed_lat, &determinized_lat); - fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &determinized_lat); - if (determinized_lat.Start() == fst::kNoStateId) { - KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; + CompactLattice determinized_clat; + DeterminizeLattice(composed_lat, &determinized_clat); + if (determinized_clat.Start() == fst::kNoStateId) { + KALDI_WARN << "Empty lattice for utterance " << key + << " (incompatible LM?)"; n_fail++; } else { - if (write_compact) { - compact_lattice_writer.Write(key, determinized_lat); - } else { + if (!determinize) { + RemoveAlignmentsFromCompactLattice(&determinized_clat); + Lattice lat2; + ConvertLattice(determinized_clat, &lat2); + fst::Project(&lat2, fst::PROJECT_OUTPUT); // project on words + fst::ArcSort(&lat2, fst::ILabelCompare()); + + // Avoid double counting of acoustic scores + fst::ScaleLattice(fst::AcousticLatticeScale(0.0), &lat2); + Lattice out_lat; - fst::ConvertLattice(determinized_lat, &out_lat); - // Replace each arc (t, tid) with the averaged acoustic score from - // the computed map - ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); - lattice_writer.Write(key, out_lat); + // out_lat will have the original acoustic and graph scores + // (after scaling by lm_scale below) along with LM scores + // added from lat2 + Compose(lat_copy, lat2, &out_lat); + + fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &out_lat); + + if (out_lat.Start() == fst::kNoStateId) { // empty composition. + KALDI_WARN << "For utterance " << key << ", composed result is empty."; + n_fail++; + } else { + if (write_compact) { + CompactLattice out_clat; + ConvertLattice(out_lat, &out_clat); + compact_lattice_writer.Write(key, out_clat); + } else { + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); + } + n_done++; + } + } else { + fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &determinized_clat); + if (write_compact) { + compact_lattice_writer.Write(key, determinized_clat); + } else { + Lattice out_lat; + fst::ConvertLattice(determinized_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); + } + n_done++; } - n_done++; } } else { - // zero scale so nothing to do. + // Zero scale so nothing to do. n_done++; if (write_compact) { diff --git a/src/latbin/lattice-word-align.cc b/src/latbin/lattice-word-align.cc index 703202454a8..89a3c769772 100644 --- a/src/latbin/lattice-word-align.cc +++ b/src/latbin/lattice-word-align.cc @@ -51,7 +51,7 @@ int main(int argc, char *argv[]) { "even if there was an error (e.g. 
caused by forced-out lattice)");
    po.Register("test", &test, "If true, activate checks designed to test the code.");
-    WordBoundaryInfoOpts opts;
+    WordBoundaryInfoNewOpts opts;
    opts.Register(&po);

From fcefeaafdfe097c9ba7e6a9687197c15f24c78d8 Mon Sep 17 00:00:00 2001
From: Vimal Manohar
Date: Thu, 2 Nov 2017 17:14:48 -0400
Subject: [PATCH 088/174] semisup: Adding tfrnnlm scripts

---
 .../s5/local/tfrnnlm/rnnlm_data_prep.sh | 87 +++++++++++++++++++
 .../s5/local/tfrnnlm/run_lstm_fast.sh | 58 +++++++++++++
 .../s5/local/tfrnnlm/run_vanilla_rnnlm.sh | 57 ++++++++++++
 3 files changed, 202 insertions(+)
 create mode 100755 egs/fisher_english/s5/local/tfrnnlm/rnnlm_data_prep.sh
 create mode 100755 egs/fisher_english/s5/local/tfrnnlm/run_lstm_fast.sh
 create mode 100755 egs/fisher_english/s5/local/tfrnnlm/run_vanilla_rnnlm.sh

diff --git a/egs/fisher_english/s5/local/tfrnnlm/rnnlm_data_prep.sh b/egs/fisher_english/s5/local/tfrnnlm/rnnlm_data_prep.sh
new file mode 100755
index 00000000000..8d2a36a94c8
--- /dev/null
+++ b/egs/fisher_english/s5/local/tfrnnlm/rnnlm_data_prep.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+
+# This script prepares the data directory used for TensorFlow based RNNLM training
+# it prepares the following files in the output-directory
+# 1. $dir/wordlist.rnn.final : wordlist for RNNLM
+# format of this file is like the following:
+# 0 The
+# 1 a
+# 2 is
+# ....
+# note that we don't reserve the 0 id for any special symbol
+# 2. $dir/{train/valid} : the text files, with each sentence in a line
+
+# 3. $dir/unk.probs : this file provides information for distributing OOS probs
+# among all the OOS words, in rnnlm-rescoring. If provided, the
+# probability for would be proportionally distributed among all OOS words
+#
+# It is called unk.probs to be consistent with rnnlm-rescoring scripts with
+# Mikolov's and Yandex's toolkits, but you could simply provide the count instead, as
+# the binary would auto-normalize the counts into probabilities
+# the format of this file is like the following:
+# some-rare-word-1 0.0003
+# some-rare-word-2 0.0004
+# ...
+
+set -e
+
+train_text=data/train/text
+nwords=9999
+heldout_sent=10000
+
+. path.sh
+. cmd.sh
+
+. utils/parse_options.sh
+
+if [ $# != 1 ]; then
+  echo "Usage: $0 "
+  echo "For details of what the script does, see top of script file"
+  exit 1;
+fi
+
+dir=$1
+srcdir=data/local/dict
+
+mkdir -p $dir
+
+cat $srcdir/lexicon.txt | awk '{print $1}' | sort -u | grep -v -w '!SIL' > $dir/wordlist.all
+
+# Get training data with OOV words (w.r.t. our current vocab) replaced with ,
+# as well as adding symbols at the end of each sentence
+cat $train_text | awk -v w=$dir/wordlist.all \
+  'BEGIN{while((getline<w)>0) v[$1]=1;}
+  {for (i=2;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}' | sed 's=$= =g' \
+  | utils/shuffle_list.pl | gzip -c > $dir/all.gz
+
+echo "Splitting data into train and validation sets."
+
+gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data
+gunzip -c $dir/all.gz | tail -n +$heldout_sent > $dir/train.in # training data
+
+
+cat $dir/train.in $dir/wordlist.all | \
+  awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
+  sort -nr > $dir/unigram.counts
+
+total_nwords=`wc -l <$dir/unigram.counts`
+
+# the wordlist.rnn file is just a wordlist - i.e. with a word on each line
+# wordlist.rnn.id has [word-id] [word] on each line, with [word-id] being consecutive integers
+# this will not be the final wordlist we use because we need to add symbol
+head -$nwords $dir/unigram.counts | awk '{print $2}' | tee $dir/wordlist.rnn | awk '{print NR-1, $1}' > $dir/wordlist.rnn.id
+tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts
+
+for type in train valid; do
+  # replacing every word that does not appear in the wordlist.rnn file with a symbol
+  cat $dir/$type.in | awk -v w=$dir/wordlist.rnn 'BEGIN{while((getline<w)>0)d[$1]=1}{for(i=1;i<=NF;i++){if(d[$i]==1){s=$i}else{s=""} printf("%s ",s)} print""}' > $dir/$type
+done
+
+cat $dir/unk_class.counts | awk '{print $2, $1}' > $dir/unk.probs
+cp $dir/wordlist.rnn $dir/wordlist.rnn.final
+
+if ! grep -w '' $dir/wordlist.rnn.final >/dev/null; then
+  echo "" >> $dir/wordlist.rnn.final
+fi
+
+echo "data preparation finished"
diff --git a/egs/fisher_english/s5/local/tfrnnlm/run_lstm_fast.sh b/egs/fisher_english/s5/local/tfrnnlm/run_lstm_fast.sh
new file mode 100755
index 00000000000..6328bfd11dc
--- /dev/null
+++ b/egs/fisher_english/s5/local/tfrnnlm/run_lstm_fast.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+ngram_order=3 # this option when used, the rescoring binary makes an approximation
+              # to merge the states of the FST generated from RNNLM. e.g. if ngram-order = 4
+              # then any history that shares last 3 words would be merged into one state
+stage=1
+weight=0.5 # when we do lattice-rescoring, instead of replacing the lm-weights
+           # in the lattice with RNNLM weights, we usually do a linear combination of
+           # the 2 and the $weight variable indicates the weight for the RNNLM scores
+
+train_text=data/train/text
+nwords=9999
+opts=
+dir=data/tensorflow_fast_lstm
+
+. ./utils/parse_options.sh
+. ./cmd.sh
+. ./path.sh
+
+set -e
+
+mkdir -p $dir
+
+#steps/tfrnnlm/check_tensorflow_installed.sh
+
+if [ $stage -le 1 ]; then
+  local/tfrnnlm/rnnlm_data_prep.sh --train-text $train_text --nwords $nwords $dir
+fi
+
+mkdir -p $dir
+if [ $stage -le 2 ]; then
+# the following script uses TensorFlow. You could use tools/extras/install_tensorflow_py.sh to install it
+  $train_cmd --gpu 1 --mem 20G $dir/train_rnnlm.log utils/parallel/limit_num_gpus.sh \
+    python steps/tfrnnlm/lstm_fast.py --data-path=$dir --save-path=$dir/rnnlm --vocab-path=$dir/wordlist.rnn.final ${opts}
+fi
+
+exit 0
+
+final_lm=ami_fsh.o3g.kn
+LM=$final_lm.pr1-7
+
+if [ $stage -le 3 ]; then
+# for decode_set in dev; do
+  for decode_set in dev eval; do
+    basedir=exp/$mic/nnet3/tdnn_sp/
+    decode_dir=${basedir}/decode_${decode_set}
+
+    # Lattice rescoring
+    steps/lmrescore_rnnlm_lat.sh \
+      --cmd "$tfrnnlm_cmd --mem 16G" \
+      --rnnlm-ver tensorflow --weight $weight --max-ngram-order $ngram_order \
+      data/lang_$LM $dir \
+      data/$mic/${decode_set}_hires ${decode_dir} \
+      ${decode_dir}.unk.fast.tfrnnlm.lat.${ngram_order}gram.$weight &
+
+  done
+fi
+
+wait
diff --git a/egs/fisher_english/s5/local/tfrnnlm/run_vanilla_rnnlm.sh b/egs/fisher_english/s5/local/tfrnnlm/run_vanilla_rnnlm.sh
new file mode 100755
index 00000000000..27081b1b26f
--- /dev/null
+++ b/egs/fisher_english/s5/local/tfrnnlm/run_vanilla_rnnlm.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+ngram_order=4 # this option when used, the rescoring binary makes an approximation
+              # to merge the states of the FST generated from RNNLM. e.g.
if ngram-order = 4 + # then any history that shares last 3 words would be merged into one state +stage=1 +weight=0.5 # when we do lattice-rescoring, instead of replacing the lm-weights + # in the lattice with RNNLM weights, we usually do a linear combination of + # the 2 and the $weight variable indicates the weight for the RNNLM scores + +train_text=data/train/text +nwords=9999 +opts= +dir=data/vanilla_tensorflow + +. ./utils/parse_options.sh +. ./cmd.sh +. ./path.sh + +set -e + +mkdir -p $dir + +#steps/tfrnnlm/check_tensorflow_installed.sh + +if [ $stage -le 1 ]; then + local/tfrnnlm/rnnlm_data_prep.sh --train-text $train_text --nwords $nwords $dir +fi + +mkdir -p $dir +if [ $stage -le 2 ]; then +# the following script uses TensorFlow. You could use tools/extras/install_tensorflow_py.sh to install it + $train_cmd --gpu 1 --mem 20G $dir/train_rnnlm.log utils/parallel/limit_num_gpus.sh \ + python steps/tfrnnlm/vanilla_rnnlm.py --data-path=$dir --save-path=$dir/rnnlm --vocab-path=$dir/wordlist.rnn.final ${opts} +fi + +exit 0 + +final_lm=ami_fsh.o3g.kn +LM=$final_lm.pr1-7 + +if [ $stage -le 3 ]; then + for decode_set in dev eval; do + basedir=exp/$mic/nnet3/tdnn_sp/ + decode_dir=${basedir}/decode_${decode_set} + + # Lattice rescoring + steps/lmrescore_rnnlm_lat.sh \ + --cmd "$tfrnnlm_cmd --mem 16G" \ + --rnnlm-ver tensorflow --weight $weight --max-ngram-order $ngram_order \ + data/lang_$LM $dir \ + data/$mic/${decode_set}_hires ${decode_dir} \ + ${decode_dir}.vanilla.tfrnnlm.lat.${ngram_order}gram.$weight & + + done +fi + +wait From a0572b5e7cc9cb51ffdd7d0a0ae64e35747f3eaf Mon Sep 17 00:00:00 2001 From: System User Date: Mon, 6 Nov 2017 02:29:56 -0500 Subject: [PATCH 089/174] semisup: Undeterminized lattices recipes --- .../s5/local/fisher_create_test_lang.sh | 26 +- .../s5/local/fisher_train_lms.sh | 9 +- .../semisup/chain/tuning/run_tdnn_100k_b.sh | 197 ++++++++ .../semisup/chain/tuning/run_tdnn_100k_c.sh | 198 ++++++++ .../chain/tuning/run_tdnn_100k_c_oracle.sh | 202 ++++++++ .../run_tdnn_100k_semisupervised_conf_a.sh | 463 +++++++++++++++++ .../run_tdnn_100k_semisupervised_conf_a2.sh | 464 +++++++++++++++++ .../run_tdnn_100k_semisupervised_conf_b.sh | 462 +++++++++++++++++ .../run_tdnn_100k_semisupervised_conf_c.sh | 463 +++++++++++++++++ .../chain/tuning/run_tdnn_15k_best_path_b.sh | 55 +- .../chain/tuning/run_tdnn_15k_best_path_c.sh | 11 +- .../semisup/chain/tuning/run_tdnn_15k_d.sh | 195 ++++++++ .../semisup/chain/tuning/run_tdnn_15k_e.sh | 195 ++++++++ .../semisup/chain/tuning/run_tdnn_15k_f.sh | 195 ++++++++ .../run_tdnn_15k_semisupervised_conf_aa.sh | 445 +++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_s.sh | 72 ++- .../run_tdnn_15k_semisupervised_conf_y.sh | 472 ++++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_z.sh | 445 +++++++++++++++++ .../run_tdnn_50k_semisupervised_conf_f.sh | 4 +- .../run_tdnn_50k_semisupervised_conf_h.sh | 452 +++++++++++++++++ .../semisup/nnet3/run_ivector_common_pca.sh | 134 +++++ .../s5/local/semisup/run_100k.sh | 21 +- egs/wsj/s5/steps/best_path_weights.sh | 6 +- egs/wsj/s5/steps/libs/nnet3/train/common.py | 5 +- egs/wsj/s5/steps/lmrescore_const_arpa.sh | 24 +- egs/wsj/s5/steps/nnet3/decode.sh | 10 +- .../lattice-determinize-phone-pruned.cc | 32 +- src/latbin/lattice-determinize-pruned.cc | 28 +- src/latbin/lattice-interp.cc | 14 +- src/latbin/lattice-lmrescore-const-arpa.cc | 77 +-- src/latbin/lattice-lmrescore.cc | 73 +-- 31 files changed, 5236 insertions(+), 213 deletions(-) create mode 100755 
egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_b.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c_oracle.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a2.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_b.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_c.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_d.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_e.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_f.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aa.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_y.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_z.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_h.sh create mode 100755 egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common_pca.sh diff --git a/egs/fisher_english/s5/local/fisher_create_test_lang.sh b/egs/fisher_english/s5/local/fisher_create_test_lang.sh index 533a0949962..5b9e52f9b31 100755 --- a/egs/fisher_english/s5/local/fisher_create_test_lang.sh +++ b/egs/fisher_english/s5/local/fisher_create_test_lang.sh @@ -1,23 +1,24 @@ #!/bin/bash # -if [ -f path.sh ]; then . path.sh; fi +arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz +dir=data/lang_test -mkdir -p data/lang_test +if [ -f path.sh ]; then . path.sh; fi +. utils/parse_options.sh -arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz [ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; -mkdir -p data/lang_test -cp -r data/lang/* data/lang_test +mkdir -p $dir +cp -r data/lang/* $dir gunzip -c "$arpa_lm" | \ arpa2fst --disambig-symbol=#0 \ - --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst + --read-symbol-table=$dir/words.txt - $dir/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic $dir/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -27,24 +28,21 @@ fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/l echo Performing further checks # Checking that G.fst is determinizable. -fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. +fstdeterminize $dir/G.fst /dev/null || echo Error determinizing G. # Checking that L_disambig.fst is determinizable. -fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. +fstdeterminize $dir/L_disambig.fst /dev/null || echo Error determinizing L. # Checking that disambiguated lexicon times G is determinizable # Note: we do this with fstdeterminizestar not fstdeterminize, as # fstdeterminize was taking forever (presumbaly relates to a bug # in this version of OpenFst that makes determinization slow for # some case). 
-fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ +fsttablecompose $dir/L_disambig.fst $dir/G.fst | \ fstdeterminizestar >/dev/null || echo Error # Checking that LG is stochastic: -fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ +fsttablecompose data/lang/L_disambig.fst $dir/G.fst | \ fstisstochastic || echo "[log:] LG is not stochastic" -utils/build_const_arpa_lm.sh data/local/lm/4gram-mincount/lm_unpruned.gz \ - data/lang_test data/lang_test_fg - echo "$0 succeeded" diff --git a/egs/fisher_english/s5/local/fisher_train_lms.sh b/egs/fisher_english/s5/local/fisher_train_lms.sh index 585680550f8..811e09dec6d 100755 --- a/egs/fisher_english/s5/local/fisher_train_lms.sh +++ b/egs/fisher_english/s5/local/fisher_train_lms.sh @@ -6,6 +6,14 @@ text=data/train_all/text lexicon=data/local/dict/lexicon.txt +dir=data/local/lm + +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + echo "Usage: $0 [options]" + exit 1 +fi for f in "$text" "$lexicon"; do [ ! -f $x ] && echo "$0: No such file $f" && exit 1; @@ -17,7 +25,6 @@ done #data/train_all/text #data/local/dict/lexicon.txt -dir=data/local/lm mkdir -p $dir export LC_ALL=C # You'll get errors about things being not sorted, if you # have a different locale. diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_b.sh new file mode 100755 index 00000000000..22fc833f613 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_b.sh @@ -0,0 +1,197 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 100 hours. +# This is similar to _b, but uses an extra layer. + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix= +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
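+  # (Roughly speaking, the 'chain' topology written to $lang/topo in the
+  # previous stage is one that each phone can traverse in a single frame,
+  # which is what makes the reduced frame rate implied by
+  # --frame-subsampling-factor 3 below workable.)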
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
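+  # (A concrete instance of the learning-rate-factor comment in the configs
+  # above: with xent_regularize=0.1 the xconfig stage effectively computes
+  #   python -c "print 0.5/0.1"    # -> 5.0
+  # so the output-xent layer learns at 5x the base rate, compensating for
+  # the 0.1 scale on the cross-entropy objective.)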
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c.sh new file mode 100755 index 00000000000..f6b94ee594c --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c.sh @@ -0,0 +1,198 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 100 hours. +# This is similar to _b, but uses a bi-phone tree with 7000 leaves + +# configs for 'chain' +stage=0 +tdnn_affix=7c +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_a +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
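+  # (The --context-opts passed to build_tree.sh below, --context-width=2
+  # with --central-position=1, give a left-biphone tree: each leaf may
+  # depend on the current phone and the one before it, rather than on full
+  # triphone context. This is the bi-phone tree with 7000 leaves mentioned
+  # in the header.)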
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
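+  # (Briefly, on two of the chain options passed to train.py below:
+  # --num-extra-lm-states=2000 bounds the size of the phone-level language
+  # model estimated for the denominator graph, and
+  # --chain.leaky-hmm-coefficient 0.1 adds small "leak" transitions to the
+  # denominator HMM, a mild smoothing that also helps keep the
+  # forward-backward computation numerically well behaved.)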
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c_oracle.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c_oracle.sh new file mode 100755 index 00000000000..696b26d1d5a --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c_oracle.sh @@ -0,0 +1,202 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 100 hours. +# This is similar to _b, but uses an extra layer. + +# configs for 'chain' +stage=0 +tdnn_affix=7c_oracle +train_stage=-10 +get_egs_stage=-10 +decode_iter= +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k_n10k +base_train_set=train_oracle100k_250k_n10k +ivector_train_set=train_sup +tree_affix=bi_a +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ ! 
-f $treedir/final.mdl ]; then + echo "$0: Could not find $treedir/final.mdl" + exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
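+  # (Judging from the train_oracle* data-set name above, this variant uses
+  # the same 100-hour supervised plus 250-hour pool as the semi-supervised
+  # recipes that follow, but with reference transcripts for the
+  # "unsupervised" portion, so its results serve as an upper bound for
+  # those systems.)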
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a.sh new file mode 100644 index 00000000000..7d956149ef7 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a.sh @@ -0,0 +1,463 @@ +#!/bin/bash + +# This script is semi-supervised training with 100 hours supervised data +# and 250 hours unsupervised data with naive splitting. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
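+    # (On the unsupervised get_egs.sh call below: --deriv-weights-scp
+    # supplies the per-frame posteriors of the best-path pdfs, taken from
+    # the best_path_* directory prepared above, so frames the seed model is
+    # unsure about contribute less to the gradient; --lattice-lm-scale 0.5
+    # scales the lattice graph/LM costs before they enter the supervision;
+    # and --alignment-subsampling-factor is 1 rather than 3 because these
+    # lattices come from the chain decode and are already at the subsampled
+    # frame rate.)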
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a2.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a2.sh 
new file mode 100644 index 00000000000..2e6c8bc36f3 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a2.sh @@ -0,0 +1,464 @@ +#!/bin/bash + +# This script is semi-supervised training with 100 hours supervised data +# and 250 hours unsupervised data with naive splitting. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1a2 # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
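+    # (A note on the output-0 / output-1 nodes defined in the configs
+    # above: the supervised and unsupervised egs are later merged with
+    # steps/nnet3/multilingual/combine_egs.sh as if they were two
+    # "languages", with --lang2weight $supervision_weights scaling each
+    # source's contribution; output-0 and output-1 are per-source copies of
+    # the shared output layer, and the decode stage below renames output-0
+    # back to 'output' before decoding.)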
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_b.sh new 
file mode 100644 index 00000000000..d38b97bfe43 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_b.sh @@ -0,0 +1,462 @@ +#!/bin/bash + +# This script is similar to _a but uses smart splitting. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
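    # --- Illustrative aside (editor's sketch, not part of the original recipe) ---
    # The --left-context/--right-context values passed to get_egs_split.sh below
    # were computed earlier from $dir/configs/vars.  A worked example with a
    # hypothetical model_left_context=16, extra_left_context=0 and
    # frame_subsampling_factor=3:
    #   left_context = 16 + 0 = 16, then int(16 + 3/2) = int(17.5) = 17
    perl -e 'print int(16 + 0 + 3 / 2), "\n";'   # prints 17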
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_c.sh new 
file mode 100644 index 00000000000..7790f004b64 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_c.sh @@ -0,0 +1,463 @@ +#!/bin/bash + +# This script is semi-supervised training with 100 hours supervised data +# and 250 hours unsupervised data with naive splitting. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7c # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1c # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_a +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
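    # --- Illustrative aside (editor's sketch, not part of the original recipe) ---
    # With the defaults at the top of this script (graph_affix=_sup100k,
    # lattice_prune_beam=4.0, lattice_lm_scale=0.5, tolerance=1), the
    # unsupervised egs directory name used below expands roughly as sketched
    # here; the subshell keeps the example from clobbering the real variables.
    ( decode_affix=_sup100k
      egs_affix=_prun4.0_lmwt0.5_tol1
      unsupervised_set=train_unsup100k_250k_sp
      echo "egs_${unsupervised_set}${decode_affix}${egs_affix}" )
    # -> egs_train_unsup100k_250k_sp_sup100k_prun4.0_lmwt0.5_tol1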
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_b.sh index 
88988be9d1c..774ce524221 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_b.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_b.sh @@ -42,7 +42,6 @@ tree_affix= unsup_egs_opts= apply_deriv_weights=true - extra_left_context=0 extra_right_context=0 @@ -55,6 +54,8 @@ minibatch_size="150=128/300=64" decode_iter= lang_test_suffix= +do_finetuning=false + finetune_stage=-2 finetune_suffix=_finetune finetune_iter=final @@ -109,6 +110,8 @@ fi unsupervised_set=${unsupervised_set}_240k +det_decode_affix=${graph_affix} + for dset in $unsupervised_set; do if [ $stage -le 3 ] && [ ! -f data/${dset}_sp_hires/feats.scp ]; then utils/data/perturb_data_dir_speed_3way.sh data/$dset data/${dset}_sp_hires_tmp @@ -125,22 +128,37 @@ for dset in $unsupervised_set; do $graphdir data/${dset}_sp_hires $chaindir/decode_${dset}_sp${decode_affix} fi - if [ $stage -le 6 ]; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" --write-compact false \ - data/lang_test${graph_affix} \ - data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ - $chaindir/decode_${dset}_sp${decode_affix} \ - $chaindir/decode_${dset}_sp${decode_affix}_fg + if [ ! -f $chaindir/decode_${dset}_sp${det_decode_affix}/lat.1.gz ]; then + out_dir=$chaindir/decode_${dset}_sp${det_decode_affix} + mkdir -p $out_dir + + if [ $stage -le 5 ]; then + $decode_cmd JOB=1:$decode_nj $out_dir/determinize_lattice.JOB.log \ + lattice-determinize-phone-pruned-non-compact --beam 8.0 \ + $chaindir/final.mdl \ + "ark:gunzip -c $chaindir/decode_${dset}_sp${det_decode_affix}/lat.JOB.gz |" \ + "ark:| gzip -c > $out_dir/lat.JOB.gz" || exit 1 + fi + fi - ln -s ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}_fg/final.mdl || true + if [ ! -f $chaindir/decode_${dset}_sp${det_decode_affix}_fg/lat.1.gz ]; then + if [ $stage -le 6 ]; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" --write-compact false \ + data/lang_test${graph_affix} \ + data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ + $chaindir/decode_${dset}_sp${det_decode_affix} \ + $chaindir/decode_${dset}_sp${det_decode_affix}_fg + fi fi done -if [ $stage -le 8 ]; then - steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ - data/${unsupervised_set}_sp_hires data/lang_chain \ - $chaindir/decode_${unsupervised_set}_sp${decode_affix}_fg \ - $chaindir/best_path_${unsupervised_set}_sp${decode_affix}_fg +if [ ! -f $chaindir/best_path_${unsupervised_set}_sp${det_decode_affix}_fg/ali.1.gz ]; then + if [ $stage -le 8 ]; then + steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ + data/${unsupervised_set}_sp_hires data/lang_chain \ + $chaindir/decode_${unsupervised_set}_sp${det_decode_affix}_fg \ + $chaindir/best_path_${unsupervised_set}_sp${det_decode_affix}_fg + fi fi frame_subsampling_factor=1 @@ -157,18 +175,18 @@ if [ ! 
-f $treedir/final.mdl ]; then exit 1 fi -this_nj=$(cat $chaindir/decode_${unsupervised_set}_sp${decode_affix}_fg/num_jobs) - +this_nj=$(cat $chaindir/decode_${unsupervised_set}_sp${decode_affix}/num_jobs) + if [ $stage -le 9 ]; then out_dir=$chaindir/best_path_lats_${unsupervised_set}_sp${decode_affix}_fg $train_cmd JOB=1:$this_nj $out_dir/log/get_best_path_lats.JOB.log \ lattice-interp "ark:gunzip -c $chaindir/decode_${unsupervised_set}_sp${decode_affix}/lat.JOB.gz |" \ - "ark:gunzip -c $chaindir/decode_${unsupervised_set}_sp${decode_affix}_fg/lat.JOB.gz | lattice-1best ark:- ark:- |" \ + "ark:gunzip -c $chaindir/decode_${unsupervised_set}_sp${det_decode_affix}_fg/lat.JOB.gz | lattice-1best --acoustic-scale=0.1 ark:- ark:- |" \ "ark:| gzip -c > $out_dir/lat.JOB.gz" echo $this_nj > $out_dir/num_jobs fi - + ln -sf ../final.mdl $chaindir/best_path_lats_${unsupervised_set}_sp${decode_affix}_fg/final.mdl echo $this_nj > $chaindir/best_path_lats_${unsupervised_set}_sp${decode_affix}_fg/num_jobs @@ -295,6 +313,7 @@ if [ -z "$unsup_egs_dir" ]; then utils/create_split_dir.pl \ /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage fi + mkdir -p $unsup_egs_dir touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. echo "$0: generating egs from the unsupervised data" @@ -306,7 +325,7 @@ if [ -z "$unsup_egs_dir" ]; then --frame-subsampling-factor $frame_subsampling_factor \ --cmvn-opts "$cmvn_opts" \ --phone-insertion-penalty "$phone_insertion_penalty" \ - --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${det_decode_affix}/weights.scp \ --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ --generate-egs-scp true $unsup_egs_opts \ data/${unsupervised_set}_hires $dir \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_c.sh index e33331cc428..6c5eb9679c6 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_c.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_best_path_c.sh @@ -141,12 +141,12 @@ if [ $stage -le 8 ]; then data/${unsupervised_set}_sp_hires data/lang_chain \ $chaindir/decode_${unsupervised_set}_sp${decode_affix}_fg \ $chaindir/best_path_${unsupervised_set}_sp${decode_affix}_fg - - utils/copy_data_dir.sh data/${unsupervised_set} $chaindir/${unsupervised_set} - cp $chaindir/best_path_${unsupervised_set}_sp${decode_affix}_fg/words.txt \ - $chaindir/${unsupervised_set}/text fi +utils/copy_data_dir.sh data/${unsupervised_set}_sp_hires $chaindir/${unsupervised_set}_sp_hires +cp $chaindir/best_path_${unsupervised_set}_sp${decode_affix}_fg/text \ + $chaindir/${unsupervised_set}_sp_hires/text + frame_subsampling_factor=1 if [ -f $chaindir/frame_subsampling_factor ]; then frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` @@ -167,7 +167,7 @@ if [ $stage -le 9 ]; then --scale-opts "--transition-scale=1.0 --self-loop-scale=1.0" \ --acoustic-scale 1.0 \ --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ - $chaindir/${unsupervised_set} data/lang_chain \ + $chaindir/${unsupervised_set}_sp_hires data/lang_chain \ $chaindir $out_dir fi @@ -293,6 +293,7 @@ if [ -z "$unsup_egs_dir" ]; then utils/create_split_dir.pl \ /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date 
+'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage fi + mkdir -p $unsup_egs_dir touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. echo "$0: generating egs from the unsupervised data" diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_d.sh new file mode 100755 index 00000000000..16d9f12acf4 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_d.sh @@ -0,0 +1,195 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 10 hours. + +# configs for 'chain' +stage=0 +tdnn_affix=7d +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +ivector_train_set=semisup15k_250k +tree_affix= +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
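  # For example, with the default xent_regularize=0.1 set above, the
  # learning-rate-factor computed into $learning_rate_factor is 0.5 / 0.1 = 5.0.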
+ relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_e.sh new file mode 100755 index 00000000000..4f077e74410 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_e.sh @@ -0,0 +1,195 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 10 hours. + +# configs for 'chain' +stage=0 +tdnn_affix=7e +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +ivector_train_set=semisup15k_250k +tree_affix=e +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 2000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
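  # --- Illustrative aside (editor's sketch, not part of the original recipe) ---
  # Very rough sizing for the training run below, assuming the largest chunk
  # width (160 frames) and minibatch_size=128 chunks: an archive of about
  # 1500000 frames holds 1500000/160 = 9375 chunks, i.e. roughly 9375/128 = 73
  # minibatches per iteration.
  echo $((1500000 / 160)) $((1500000 / 160 / 128))   # 9375 73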
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_f.sh new file mode 100755 index 00000000000..4dd2a38fd13 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_f.sh @@ -0,0 +1,195 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 10 hours. + +# configs for 'chain' +stage=0 +tdnn_affix=7f +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +ivector_train_set=semisup15k_250k +tree_affix=f +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
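  # --- Illustrative aside (editor's note, not part of the original recipe) ---
  # The _d, _e and _f variants of this 15k recipe differ mainly in the number
  # of tree leaves requested in stage 11 (7000, 2000 and 4000 respectively)
  # and in tdnn_affix/tree_affix.  To confirm how many pdfs the tree actually
  # ended up with (same command as used for $num_targets above):
  tree-info $treedir/tree | grep num-pdfs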
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aa.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aa.sh new file mode 100644 index 00000000000..7e7a734e4fb --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aa.sh @@ -0,0 +1,445 @@ +#!/bin/bash + +# This script is same as _z, but uses 7d as seed model and bi_d tree. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1aa # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
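(Editor's illustration, not part of the patch.) Before dumping the unsupervised egs it can help to eyeball the per-frame derivative weights that --deriv-weights-scp will apply; assuming weights.scp from best_path_weights.sh holds one weight vector per utterance (the usual Kaldi format), a rough per-utterance average of the best-path posteriors can be printed with:

copy-vector scp:$chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp ark,t:- | \
  awk '{n=0; s=0; for (i=3; i<NF; i++) {s+=$i; n++}; if (n>0) printf "%s %.3f\n", $1, s/n}' | head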
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh 
index 5c991e7770c..3bd721ba32c 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh @@ -37,12 +37,13 @@ phone_insertion_penalty= comb_affix=comb1s # affix for new chain-model directory trained on the combined supervised+unsupervised subsets supervision_weights=1.0,1.0 lm_weights=5,2 -sup_egs_dir= +sup_egs_dir= unsup_egs_dir= tree_affix= unsup_egs_opts= apply_deriv_weights=true +do_finetuning=false extra_left_context=0 extra_right_context=0 @@ -110,6 +111,8 @@ fi unsupervised_set=${unsupervised_set}_240k +det_decode_affix=${graph_affix} + for dset in $unsupervised_set; do if [ $stage -le 3 ] && [ ! -f data/${dset}_sp_hires/feats.scp ]; then utils/data/perturb_data_dir_speed_3way.sh data/$dset data/${dset}_sp_hires_tmp @@ -126,14 +129,41 @@ for dset in $unsupervised_set; do $graphdir data/${dset}_sp_hires $chaindir/decode_${dset}_sp${decode_affix} fi - if [ $stage -le 6 ]; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" --write-compact false \ - data/lang_test${graph_affix} \ - data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ - $chaindir/decode_${dset}_sp${decode_affix} \ - $chaindir/decode_${dset}_sp${decode_affix}_fg + if [ ! -f $chaindir/decode_${dset}_sp${det_decode_affix}/lat.1.gz ]; then + out_dir=$chaindir/decode_${dset}_sp${det_decode_affix} + mkdir -p $out_dir + + if [ $stage -le 5 ]; then + $decode_cmd JOB=1:$decode_nj $out_dir/determinize_lattice.JOB.log \ + lattice-determinize-phone-pruned-non-compact --acoustic-scale=0.1 --beam=8.0 \ + $chaindir/final.mdl \ + "ark:gunzip -c $chaindir/decode_${dset}_sp${det_decode_affix}/lat.JOB.gz |" \ + "ark:| gzip -c > $out_dir/lat.JOB.gz" || exit 1 + fi + fi + + if [ ! -f $chaindir/decode_${dset}_sp${det_decode_affix}_fg/lat.1.gz ]; then + if [ $stage -le 6 ]; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" --write-compact false \ + data/lang_test${graph_affix} \ + data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ + $chaindir/decode_${dset}_sp${det_decode_affix} \ + $chaindir/decode_${dset}_sp${det_decode_affix}_fg + fi + fi + + if [ $stage -le 7 ]; then + out_dir=$chaindir/decode_${dset}_sp${decode_affix}_fg + mkdir -p $out_dir - ln -s ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}_fg/final.mdl || true + $decode_cmd JOB=1:$decode_nj $out_dir/log/compose_lat.JOB.log \ + lattice-interp --alpha=0 --write-compact=false \ + "ark:gunzip -c $chaindir/decode_${dset}_sp${decode_affix}/lat.JOB.gz |" \ + "ark:gunzip -c $chaindir/decode_${dset}_sp${det_decode_affix}_fg/lat.JOB.gz |" \ + "ark:| gzip -c > $out_dir/lat.JOB.gz" + echo $decode_nj > $out_dir/num_jobs + + ln -sf ../final.mdl $out_dir/final.mdl || true fi done @@ -142,8 +172,8 @@ decode_affix=${decode_affix}_fg if [ $stage -le 8 ]; then steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ data/${unsupervised_set}_sp_hires data/lang_chain \ - $chaindir/decode_${unsupervised_set}_sp${decode_affix} \ - $chaindir/best_path_${unsupervised_set}_sp${decode_affix} + $chaindir/decode_${unsupervised_set}_sp${det_decode_affix} \ + $chaindir/best_path_${unsupervised_set}_sp${det_decode_affix} fi frame_subsampling_factor=1 @@ -155,11 +185,9 @@ cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 sup_ali_dir=$exp/tri3 treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} -if [ $stage -le 9 ]; then - if [ -f $treedir/final.mdl ]; then - echo "$0: $treedir/final.mdl already exists. Remove it and try again." - exit 1 - fi +if [ ! 
-f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 fi dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} @@ -167,12 +195,12 @@ dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_ if [ $stage -le 10 ]; then steps/subset_ali_dir.sh --cmd "$train_cmd" \ data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ - $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ - $chaindir/best_path_${unsupervised_set}${decode_affix} - echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + $chaindir/best_path_${unsupervised_set}_sp${det_decode_affix} \ + $chaindir/best_path_${unsupervised_set}${det_decode_affix} + echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${det_decode_affix}/frame_subsampling_factor steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ - ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${det_decode_affix} \ $dir fi @@ -249,6 +277,7 @@ if [ -z "$sup_egs_dir" ]; then utils/create_split_dir.pl \ /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage fi + mkdir -p $sup_egs_dir/ touch $sup_egs_dir/.nodelete # keep egs around when that run dies. echo "$0: generating egs from the supervised data" @@ -281,10 +310,11 @@ if [ -z "$unsup_egs_dir" ]; then utils/create_split_dir.pl \ /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage fi + mkdir -p $unsup_egs_dir touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
echo "$0: generating egs from the unsupervised data" - steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ --left-tolerance $tolerance --right-tolerance $tolerance \ --left-context $left_context --right-context $right_context \ --left-context-initial $left_context_initial --right-context-final $right_context_final \ @@ -293,7 +323,7 @@ if [ -z "$unsup_egs_dir" ]; then --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ --lattice-prune-beam "$lattice_prune_beam" \ --phone-insertion-penalty "$phone_insertion_penalty" \ - --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${det_decode_affix}/weights.scp \ --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ --generate-egs-scp true $unsup_egs_opts \ data/${unsupervised_set}_hires $dir \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_y.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_y.sh new file mode 100644 index 00000000000..ec537d2f84b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_y.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# This script is same as _s, but uses naive splitting +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1y # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. 
./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! cuda-compiled; then + cat < $out_dir/lat.JOB.gz" || exit 1 + fi + fi + + if [ ! -f $chaindir/decode_${dset}_sp${det_decode_affix}_fg/lat.1.gz ]; then + if [ $stage -le 6 ]; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" --write-compact false \ + data/lang_test${graph_affix} \ + data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ + $chaindir/decode_${dset}_sp${det_decode_affix} \ + $chaindir/decode_${dset}_sp${det_decode_affix}_fg + fi + fi + + if [ $stage -le 7 ]; then + out_dir=$chaindir/decode_${dset}_sp${decode_affix}_fg + mkdir -p $out_dir + + $decode_cmd JOB=1:$decode_nj $out_dir/log/compose_lat.JOB.log \ + lattice-interp --alpha=0 --write-compact=false \ + "ark:gunzip -c $chaindir/decode_${dset}_sp${decode_affix}/lat.JOB.gz |" \ + "ark:gunzip -c $chaindir/decode_${dset}_sp${det_decode_affix}_fg/lat.JOB.gz |" \ + "ark:| gzip -c > $out_dir/lat.JOB.gz" + echo $decode_nj > $out_dir/num_jobs + + ln -sf ../final.mdl $out_dir/final.mdl || true + fi +done + +decode_affix=${decode_affix}_fg + +if [ $stage -le 8 ]; then + steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ + data/${unsupervised_set}_sp_hires data/lang_chain \ + $chaindir/decode_${unsupervised_set}_sp${det_decode_affix} \ + $chaindir/best_path_${unsupervised_set}_sp${det_decode_affix} +fi + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri3 + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/subset_ali_dir.sh --cmd "$train_cmd" \ + data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ + $chaindir/best_path_${unsupervised_set}_sp${det_decode_affix} \ + $chaindir/best_path_${unsupervised_set}${det_decode_affix} + echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${det_decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${det_decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
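(Editor's note, not part of the patch.) The perl one-liners near the top of this block simply pad each context by floor(frame_subsampling_factor / 2); for integer contexts plain shell arithmetic gives the same value. A minimal sketch with hypothetical numbers (model_left_context=16, subsampling factor 3), using ex_-prefixed names so nothing in the script is clobbered:

ex_model_left_context=16; ex_extra_left_context=0; ex_frame_subsampling_factor=3
ex_left_context=$(( ex_model_left_context + ex_extra_left_context + ex_frame_subsampling_factor / 2 ))
echo "padded left context: $ex_left_context"   # prints 17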
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${det_decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_z.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_z.sh new 
file mode 100644 index 00000000000..0d201a5ace8 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_z.sh @@ -0,0 +1,445 @@ +#!/bin/bash + +# This script is same as _d, but uses a weight of 1.0 for unsupervised set. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1z # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
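(Editor's note, not part of the patch.) The two egs dumps deliberately use different --alignment-subsampling-factor values: the supervised lattices (presumably from the tri3 GMM system, at the original frame rate) are subsampled by 3, whereas the unsupervised lattices were decoded with the chain model itself and are already at the subsampled output rate, hence factor 1. A quick consistency check of the factor being assumed:

echo "chain model frame_subsampling_factor: $(cat $chaindir/frame_subsampling_factor)"   # typically 3 for chain models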
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh 
index 6662ec058e4..00f5b2556e7 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_f.sh @@ -49,6 +49,8 @@ do_finetuning=false extra_left_context=0 extra_right_context=0 +train_extra_opts= + xent_regularize=0.1 hidden_dim=725 minibatch_size=128 @@ -341,7 +343,7 @@ if [ $stage -le 15 ]; then --feat-dir data/${supervised_set}_hires \ --tree-dir $treedir \ --lat-dir $sup_lat_dir \ - --dir $dir || exit 1; + --dir $dir ${train_extra_opts} || exit 1; fi graph_dir=$dir/graph diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_h.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_h.sh new file mode 100755 index 00000000000..c1fb23eb970 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_h.sh @@ -0,0 +1,452 @@ +#!/bin/bash + +# This script is same as _g, but split lattice supervision +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1h # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
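(Editor's illustration, not part of the patch.) Unlike the 15k variants above, this 50k script also accepts a pre-built combined egs directory; the [ -z "$comb_egs_dir" ] guards let a rerun skip both egs dumps and the combination stage. A hypothetical invocation reusing earlier egs (the comb-egs-dir path is a placeholder):

local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_h.sh \
  --comb-egs-dir exp/semisup_50k/chain_semi50k_250k/tdnn_7b_example/comb1h_egs_example_multi \
  --stage 15 --train-stage -10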
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common_pca.sh b/egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common_pca.sh new file mode 100755 index 
00000000000..99648a93f08 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common_pca.sh @@ -0,0 +1,134 @@ +#!/bin/bash + +. ./cmd.sh +set -e +stage=1 +speed_perturb=true +train_set=train + +unsup_train_set= +semisup_train_set= + +nnet3_affix= +exp=exp + +. ./path.sh +. ./utils/parse_options.sh + +if [ ! -z "$unsup_train_set" ] && [ -z "$semisup_train_set" ]; then + echo "$0: --semisup-train-set must be provided if --unsup-train-set is provided" + exit 1 +fi + +if [ -z "$unsup_train_set" ] && [ ! -z "$semisup_train_set" ]; then + echo "$0: --unsup-train-set must be provided if --semisup-train-set is provided" + exit 1 +fi + +if [ ! -z "$unsup_train_set" ]; then + utils/combine_data.sh data/$semisup_train_set \ + data/$train_set data/$unsup_train_set +fi + +# perturbed data preparation +if [ "$speed_perturb" == "true" ]; then + if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have + # to perturb the normal data to get the alignments. + # _sp stands for speed-perturbed + + for datadir in ${train_set} ${unsup_train_set}; do + utils/data/perturb_data_dir_speed_3way.sh data/${datadir} data/${datadir}_sp + utils/fix_data_dir.sh data/${datadir}_sp + + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${datadir}_sp exp/make_mfcc/${datadir}_sp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_sp exp/make_mfcc/${datadir}_sp $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_sp + done + fi +fi + +if [ ! -z "$unsup_train_set" ]; then + utils/combine_data.sh data/${semisup_train_set}_sp \ + data/${train_set}_sp data/${unsup_train_set}_sp +fi + +if [ $stage -le 3 ]; then + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english-$date/s5b/$mfccdir/storage $mfccdir/storage + fi + + for dataset in $train_set $unsup_train_set; do + utils/copy_data_dir.sh data/${dataset}_sp data/${dataset}_sp_hires + utils/data/perturb_data_dir_volume.sh data/${dataset}_sp_hires + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${dataset}_sp_hires exp/make_hires/${dataset}_sp $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_sp_hires exp/make_hires/${dataset}_sp $mfccdir; + + # Remove the small number of utterances that couldn't be extracted for some + # reason (e.g. too short; no such file). + utils/fix_data_dir.sh data/${dataset}_sp_hires; + done + + for dataset in test dev; do + # Create MFCCs for the eval set + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ + data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems + done +fi + +ivector_train_set=${train_set}_sp +if [ ! 
-z "$unsup_train_set" ]; then + utils/combine_data.sh data/${semisup_train_set}_sp_hires \ + data/${train_set}_sp_hires data/${unsup_train_set}_sp_hires + ivector_train_set=${semisup_train_set}_sp +fi + +# ivector extractor training +if [ $stage -le 4 ]; then + steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + --max-utts 10000 --subsample 2 \ + data/${ivector_train_set}_hires \ + $exp/nnet3${nnet3_affix}/pca_transform +fi + +if [ $stage -le 5 ]; then + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${ivector_train_set}_hires 512 \ + $exp/nnet3${nnet3_affix}/pca_transform $exp/nnet3${nnet3_affix}/diag_ubm +fi + +if [ $stage -le 6 ]; then + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${ivector_train_set}_hires $exp/nnet3${nnet3_affix}/diag_ubm $exp/nnet3${nnet3_affix}/extractor || exit 1; +fi + +if [ $stage -le 7 ]; then + # We extract iVectors on all the ${train_set} data, which will be what we + # train the system on. + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${ivector_train_set}_hires data/${ivector_train_set}_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${ivector_train_set}_max2_hires $exp/nnet3${nnet3_affix}/extractor $exp/nnet3${nnet3_affix}/ivectors_${ivector_train_set}_hires || exit 1; +fi + +if [ $stage -le 8 ]; then + for dataset in test dev; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${dataset}_hires $exp/nnet3${nnet3_affix}/extractor $exp/nnet3${nnet3_affix}/ivectors_${dataset}_hires || exit 1; + done +fi + +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/run_100k.sh b/egs/fisher_english/s5/local/semisup/run_100k.sh index 4a023b627dc..b7a6f7c0f61 100644 --- a/egs/fisher_english/s5/local/semisup/run_100k.sh +++ b/egs/fisher_english/s5/local/semisup/run_100k.sh @@ -14,7 +14,7 @@ train_stage=-10 set -o pipefail exp=exp/semisup_100k -true && { +false && { utils/subset_data_dir.sh --shortest data/train_sup 100000 data/train_sup_100kshort utils/subset_data_dir.sh data/train_sup_100kshort 10000 data/train_sup_10k utils/data/remove_dup_utts.sh 100 data/train_sup_10k data/train_sup_10k_nodup @@ -71,18 +71,29 @@ steps/train_sat.sh --cmd "$train_cmd" \ utils/copy_data_dir.sh data/train_unsup250k data/train_unsup100k_250k utils/combine_data.sh data/semisup100k_250k data/train_sup \ data/train_unsup250k || exit 1 +} local/semisup/chain/tuning/run_tdnn_100k.sh \ --train-set train_sup \ --stage $stage --train-stage $train_stage \ --exp $exp \ --ivector-train-set train_sup || exit 1 -} + +local/fisher_train_lms.sh --text data/train_sup/text \ + --dir data/local/lm_sup100k + +local/fisher_create_test_lang.sh \ + --arpa-lm data/local/lm_sup100k/3gram-mincount/lm_unpruned.gz \ + --dir data/lang_test_sup100k + +utils/build_const_arpa_lm.sh \ + data/local/lm_sup100k/4gram-mincount/lm_unpruned.gz \ + data/lang_test_sup100k data/lang_test_sup100k_fg false && local/semisup/chain/tuning/run_tdnn_oracle.sh \ - --train-set semisup15k_250k \ - --nnet3-affix _semi15k_250k \ - --chain-affix _semi15k_250k_oracle \ + --train-set train_sup \ + --nnet3-affix \ + --chain-affix \ --stage 9 --train-stage $train_stage \ --exp $exp \ --ivector-train-set semisup15k_250k || exit 1 diff --git 
a/egs/wsj/s5/steps/best_path_weights.sh b/egs/wsj/s5/steps/best_path_weights.sh index 8e10298a4f3..d39d367191f 100755 --- a/egs/wsj/s5/steps/best_path_weights.sh +++ b/egs/wsj/s5/steps/best_path_weights.sh @@ -74,7 +74,7 @@ mkdir -p $dir words_wspecifier=ark:/dev/null if $write_words; then - words_wspecifier="ark,t:| utils/int2sym.pl -f 2- $lang/words.txt > words.JOB.txt" + words_wspecifier="ark,t:| utils/int2sym.pl -f 2- $lang/words.txt > $dir/text.JOB" fi if [ $stage -lt -1 ]; then @@ -166,8 +166,8 @@ done if $write_words; then for n in `seq $nj`; do - cat $dir/words.$n.txt - done > $dir/words.txt + cat $dir/text.$n + done > $dir/text fi exit 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 667b0d5e1ca..e921ee5afc1 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -921,7 +921,10 @@ def __init__(self, dest='objective_scales', type=str, action=common_lib.NullstrToNoneAction, - help="Objective scales for different outputs") + help="""Objective scales for the outputs + specified as a comma-separated list of pairs + :,:... + This will be passed to the training binary.""") # General options self.parser.add_argument("--stage", type=int, default=-4, diff --git a/egs/wsj/s5/steps/lmrescore_const_arpa.sh b/egs/wsj/s5/steps/lmrescore_const_arpa.sh index 97f8379c7df..de9fa481aa2 100755 --- a/egs/wsj/s5/steps/lmrescore_const_arpa.sh +++ b/egs/wsj/s5/steps/lmrescore_const_arpa.sh @@ -11,6 +11,10 @@ skip_scoring=false stage=1 scoring_opts= write_compact=true +acwt=0.1 +beam=8.0 +read_determinized=true +write_determinized=true # End configuration section. echo "$0 $@" # Print the command line for logging @@ -52,12 +56,28 @@ mkdir -p $outdir/log nj=`cat $indir/num_jobs` || exit 1; cp $indir/num_jobs $outdir +lats_rspecifier="ark:gunzip -c $indir/lat.JOB.gz|" +if ! $read_determinized; then + lats_rspecifier="$lats_rspecifier lattice-determinize-pruned --acoustic-scale=$acwt --beam=$beam --write-compact=$write_compact ark:- ark:- |" +fi + +lattice_copy_cmd= +if ! $write_determinized; then + if $read_determinized; then + echo "$0: --write-determinized false does not make sense when --read-determinized true is specified" + echo "$0: ignoring the option --write-determinized" + else + lattice_copy_cmd="ark:- | lattice-interp --alpha=0 --alpha-acoustic=1.0 --write-compact=$write_compact \"ark:gunzip -c $indir/lat.JOB.gz |\" ark,s,cs:- " + fi +fi + if [ $stage -le 1 ]; then $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ lattice-lmrescore --lm-scale=-1.0 --write-compact=$write_compact \ - "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlmcommand" ark:- \| \ + "$lats_rspecifier" "$oldlmcommand" ark:- \| \ lattice-lmrescore-const-arpa --lm-scale=1.0 --write-compact=$write_compact \ - ark:- "$newlm" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; + ark:- "$newlm" $lattice_copy_cmd \ + "ark:|gzip -c>$outdir/lat.JOB.gz" || exit 1; fi if ! $skip_scoring && [ $stage -le 2 ]; then diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 7d4097789a7..fca0d0158e4 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -121,16 +121,16 @@ if [ ! -z "$online_ivector_dir" ]; then fi extra_opts= -lat_wspecifier="ark:|" +lats_wspecifier="ark:|" if ! 
$write_compact; then extra_opts="--determinize-lattice=false" - lat_wspecifier="ark:| lattice-determinize-phone-pruned-non-compact --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize $determinize_opts $model ark:- ark:- |" + lats_wspecifier="ark:| lattice-determinize-phone-pruned-non-compact --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize $determinize_opts $model ark:- ark:- |" fi if [ "$post_decode_acwt" == 1.0 ]; then - lat_wspecifier="$lat_wspecifier gzip -c >$dir/lat.JOB.gz" + lats_wspecifier="$lats_wspecifier gzip -c >$dir/lat.JOB.gz" else - lat_wspecifier="$lat_wspecifier lattice-scale --acoustic-scale=$post_decode_acwt --write-compact=$write_compact ark:- ark:- | gzip -c >$dir/lat.JOB.gz" + lats_wspecifier="$lats_wspecifier lattice-scale --acoustic-scale=$post_decode_acwt --write-compact=$write_compact ark:- ark:- | gzip -c >$dir/lat.JOB.gz" fi frame_subsampling_opt= @@ -151,7 +151,7 @@ if [ $stage -le 1 ]; then --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ --word-symbol-table=$graphdir/words.txt ${extra_opts} \ "$model" \ - $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1; + $graphdir/HCLG.fst "$feats" "$lats_wspecifier" || exit 1; fi diff --git a/src/latbin/lattice-determinize-phone-pruned.cc b/src/latbin/lattice-determinize-phone-pruned.cc index 0959bcbcd74..21a29aa466b 100644 --- a/src/latbin/lattice-determinize-phone-pruned.cc +++ b/src/latbin/lattice-determinize-phone-pruned.cc @@ -1,6 +1,7 @@ // latbin/lattice-determinize-phone-pruned.cc // Copyright 2014 Guoguo Chen +// 2017 Vimal Manohar // See ../../COPYING for clarification regarding multiple authors // @@ -43,11 +44,13 @@ int main(int argc, char *argv[]) { " final.mdl ark:in.lats ark:det.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat acoustic_scale = 1.0; BaseFloat beam = 10.0; fst::DeterminizeLatticePhonePrunedOptions opts; opts.max_mem = 50000000; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic" " likelihoods."); po.Register("beam", &beam, "Pruning beam [applied after acoustic scaling]."); @@ -70,8 +73,13 @@ int main(int argc, char *argv[]) { // accepts. SequentialLatticeReader lat_reader(lats_rspecifier); - // Writes as compact lattice. 
- CompactLatticeWriter compact_lat_writer(lats_wspecifier); + CompactLatticeWriter compact_lat_writer; + LatticeWriter lat_writer; + + if (write_compact) + compact_lat_writer.Open(lats_wspecifier); + else + lat_writer.Open(lats_wspecifier); int32 n_done = 0, n_warn = 0; @@ -89,6 +97,11 @@ int main(int argc, char *argv[]) { KALDI_VLOG(2) << "Processing lattice " << key; + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + ComputeAcousticScoresMap(lat, &acoustic_scores); + fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &lat); CompactLattice det_clat; @@ -106,8 +119,19 @@ int main(int argc, char *argv[]) { sum_depth_out += depth * t; sum_t += t; - fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &det_clat); - compact_lat_writer.Write(key, det_clat); + if (write_compact) { + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &det_clat); + compact_lat_writer.Write(key, det_clat); + } else{ + Lattice out_lat; + fst::ConvertLattice(det_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lat_writer.Write(key, out_lat); + } + n_done++; } diff --git a/src/latbin/lattice-determinize-pruned.cc b/src/latbin/lattice-determinize-pruned.cc index 3e8bca5a3ce..393d98059f5 100644 --- a/src/latbin/lattice-determinize-pruned.cc +++ b/src/latbin/lattice-determinize-pruned.cc @@ -39,6 +39,7 @@ int main(int argc, char *argv[]) { " e.g.: lattice-determinize-pruned --acoustic-scale=0.1 --beam=6.0 ark:in.lats ark:det.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat acoustic_scale = 1.0; BaseFloat beam = 10.0; bool minimize = false; @@ -48,6 +49,7 @@ int main(int argc, char *argv[]) { opts.max_mem = 50000000; opts.max_loop = 0; // was 500000; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); po.Register("beam", &beam, "Pruning beam [applied after acoustic scaling]."); @@ -70,7 +72,12 @@ int main(int argc, char *argv[]) { SequentialLatticeReader lat_reader(lats_rspecifier); // Write as compact lattice. - CompactLatticeWriter compact_lat_writer(lats_wspecifier); + CompactLatticeWriter compact_lat_writer; + LatticeWriter lat_writer; + if (write_compact) + compact_lat_writer.Open(lats_wspecifier); + else + lat_writer.Open(lats_wspecifier); int32 n_done = 0, n_warn = 0; @@ -87,6 +94,11 @@ int main(int argc, char *argv[]) { KALDI_VLOG(2) << "Processing lattice " << key; + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + ComputeAcousticScoresMap(lat, &acoustic_scores); + Invert(&lat); // so word labels are on the input side. 
lat_reader.FreeCurrent(); fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &lat); @@ -121,8 +133,18 @@ int main(int argc, char *argv[]) { sum_depth_out += depth * t; sum_t += t; - fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &det_clat); - compact_lat_writer.Write(key, det_clat); + if (write_compact) { + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &det_clat); + compact_lat_writer.Write(key, det_clat); + } else { + Lattice out_lat; + fst::ConvertLattice(det_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lat_writer.Write(key, out_lat); + } n_done++; } diff --git a/src/latbin/lattice-interp.cc b/src/latbin/lattice-interp.cc index edb5d02be7e..ebb6bd40198 100644 --- a/src/latbin/lattice-interp.cc +++ b/src/latbin/lattice-interp.cc @@ -48,9 +48,13 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); bool write_compact = true; BaseFloat alpha = 0.5; // Scale of 1st in the pair. + BaseFloat alpha_acoustic = kLogZeroBaseFloat; po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("alpha", &alpha, "Scale of the first lattice in the pair (should be in range [0, 1])"); + po.Register("alpha-acoustic", &alpha_acoustic, + "If specified, then alpha will be used for graph scores and " + "alpha_acoustic will be used for acoustic scores (should be in range [0, 1])"); po.Read(argc, argv); if (po.NumArgs() != 3) { @@ -58,6 +62,12 @@ int main(int argc, char *argv[]) { exit(1); } + if (alpha_acoustic == kLogZeroBaseFloat) { + alpha_acoustic = alpha; + } + + KALDI_ASSERT(alpha_acoustic <= 1.0 && alpha_acoustic >= 0.0); + std::string lats_rspecifier1 = po.GetArg(1), lats_rspecifier2 = po.GetArg(2), lats_wspecifier = po.GetArg(3); @@ -84,7 +94,7 @@ int main(int argc, char *argv[]) { PairHasher > acoustic_scores; if (!write_compact) ComputeAcousticScoresMap(lat1, &acoustic_scores); - ScaleLattice(fst::LatticeScale(alpha, alpha), &lat1); + ScaleLattice(fst::LatticeScale(alpha, alpha_acoustic), &lat1); ArcSort(&lat1, fst::OLabelCompare()); @@ -97,7 +107,7 @@ int main(int argc, char *argv[]) { Lattice lat2; ConvertLattice(clat2, &lat2); fst::Project(&lat2, fst::PROJECT_OUTPUT); // project on words. 
- ScaleLattice(fst::LatticeScale(1.0-alpha, 1.0-alpha), &lat2); + ScaleLattice(fst::LatticeScale(1.0-alpha, 1.0-alpha_acoustic), &lat2); ArcSort(&lat2, fst::ILabelCompare()); Lattice lat3; diff --git a/src/latbin/lattice-lmrescore-const-arpa.cc b/src/latbin/lattice-lmrescore-const-arpa.cc index 06da0ba9027..4613e805b8a 100644 --- a/src/latbin/lattice-lmrescore-const-arpa.cc +++ b/src/latbin/lattice-lmrescore-const-arpa.cc @@ -44,13 +44,10 @@ int main(int argc, char *argv[]) { " const_arpa ark:out.lats\n"; ParseOptions po(usage); - bool write_compact = true, determinize = true; + bool write_compact = true; BaseFloat lm_scale = 1.0; po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); - po.Register("determinize", &determinize, "If false, then the output will retain " - "all the non-deterministic paths in the original lattice, " - "but rescored with LM score from the best path in the LM"); po.Register("lm-scale", &lm_scale, "Scaling factor for language model " "costs; frequently 1.0 or -1.0"); @@ -119,16 +116,6 @@ int main(int argc, char *argv[]) { fst::ScaleLattice(fst::GraphLatticeScale(1.0/lm_scale), &clat); ArcSort(&clat, fst::OLabelCompare()); - // A copy of the lattice is needed if determinize=false - CompactLattice clat_copy; - if (!determinize) { - clat_copy = clat; - // Remove graph scores from the lattice so that the composed lattice - // will not have any graph scores other than the one from the - // new LM. They will be added back later. - fst::ScaleLattice(fst::GraphLatticeScale(0.0), &clat); - } - // Wraps the ConstArpaLm format language model into FST. We re-create it // for each lattice to prevent memory usage increasing with time. ConstArpaLmDeterministicFst const_arpa_fst(const_arpa); @@ -144,66 +131,24 @@ int main(int argc, char *argv[]) { Invert(&composed_lat); CompactLattice determinized_clat; DeterminizeLattice(composed_lat, &determinized_clat); + fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &determinized_clat); if (determinized_clat.Start() == fst::kNoStateId) { KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; n_fail++; } else { - if (!determinize) { - RemoveAlignmentsFromCompactLattice(&determinized_clat); - - Lattice lat2; - ConvertLattice(determinized_clat, &lat2); - fst::Project(&lat2, fst::PROJECT_OUTPUT); // project on words - fst::ArcSort(&lat2, fst::ILabelCompare()); - - // Avoid double counting of acoustic scores - fst::ScaleLattice(fst::AcousticLatticeScale(0.0), &lat2); - - Lattice lat1; - ConvertLattice(clat_copy, &lat1); - fst::ArcSort(&lat1, fst::OLabelCompare()); - + if (write_compact) { + compact_lattice_writer.Write(key, determinized_clat); + } else { Lattice out_lat; + fst::ConvertLattice(determinized_clat, &out_lat); - // out_lat will have the original acoustic and graph scores - // (after scaling by lm_scale below) along with LM scores - // from lat2 - Compose(lat1, lat2, &out_lat); - - fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &out_lat); - - if (out_lat.Start() == fst::kNoStateId) { // empty composition. 
- KALDI_WARN << "For utterance " << key << ", composed result is empty."; - n_fail++; - } else { - if (write_compact) { - CompactLattice out_clat; - ConvertLattice(out_lat, &out_clat); - compact_lattice_writer.Write(key, out_clat); - } else { - // Replace each arc (t, tid) with the averaged acoustic score from - // the computed map - ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); - lattice_writer.Write(key, out_lat); - } - n_done++; - } - } else { - fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &determinized_clat); - if (write_compact) { - compact_lattice_writer.Write(key, determinized_clat); - } else { - Lattice out_lat; - fst::ConvertLattice(determinized_clat, &out_lat); - - // Replace each arc (t, tid) with the averaged acoustic score from - // the computed map - ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); - lattice_writer.Write(key, out_lat); - } - n_done++; + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); } + n_done++; } } else { // Zero scale so nothing to do. diff --git a/src/latbin/lattice-lmrescore.cc b/src/latbin/lattice-lmrescore.cc index ba7c84dd13c..d9367a55480 100644 --- a/src/latbin/lattice-lmrescore.cc +++ b/src/latbin/lattice-lmrescore.cc @@ -44,14 +44,11 @@ int main(int argc, char *argv[]) { " e.g.: lattice-lmrescore --lm-scale=-1.0 ark:in.lats 'fstproject --project_output=true data/lang/G.fst|' ark:out.lats\n"; ParseOptions po(usage); - bool write_compact = true, determinize = true; + bool write_compact = true; BaseFloat lm_scale = 1.0; int32 num_states_cache = 50000; po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); - po.Register("determinize", &determinize, "If false, then the output will retain " - "all the non-deterministic paths in the original lattice, " - "but rescored with LM score from the best path in the LM"); po.Register("lm-scale", &lm_scale, "Scaling factor for language model costs; frequently 1.0 or -1.0"); po.Register("num-states-cache", &num_states_cache, "Number of states we cache when mapping LM FST to lattice type. " @@ -136,16 +133,6 @@ int main(int argc, char *argv[]) { fst::ScaleLattice(fst::GraphLatticeScale(1.0 / lm_scale), &lat); ArcSort(&lat, fst::OLabelCompare()); - // A copy of the lattice is needed if determinize=false - Lattice lat_copy; - if (!determinize) { - lat_copy = lat; - // Remove graph scores from the lattice so that the composed lattice - // will not have any graph scores other than the one from the - // new LM. They will be added back later. - fst::ScaleLattice(fst::GraphLatticeScale(0.0), &lat); - } - Lattice composed_lat; // Could just do, more simply: Compose(lat, lm_fst, &composed_lat); // and not have lm_compose_cache at all. 
@@ -157,61 +144,23 @@ int main(int argc, char *argv[]) { CompactLattice determinized_clat; DeterminizeLattice(composed_lat, &determinized_clat); + fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &determinized_clat); if (determinized_clat.Start() == fst::kNoStateId) { KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; n_fail++; } else { - if (!determinize) { - RemoveAlignmentsFromCompactLattice(&determinized_clat); - Lattice lat2; - ConvertLattice(determinized_clat, &lat2); - fst::Project(&lat2, fst::PROJECT_OUTPUT); // project on words - fst::ArcSort(&lat2, fst::ILabelCompare()); - - // Avoid double counting of acoustic scores - fst::ScaleLattice(fst::AcousticLatticeScale(0.0), &lat2); - - Lattice out_lat; - - // out_lat will have the original acoustic and graph scores - // (after scaling by lm_scale below) along with LM scores - // added from lat2 - Compose(lat_copy, lat2, &out_lat); - - fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &out_lat); - - if (out_lat.Start() == fst::kNoStateId) { // empty composition. - KALDI_WARN << "For utterance " << key << ", composed result is empty."; - n_fail++; - } else { - if (write_compact) { - CompactLattice out_clat; - ConvertLattice(out_lat, &out_clat); - compact_lattice_writer.Write(key, out_clat); - } else { - // Replace each arc (t, tid) with the averaged acoustic score from - // the computed map - ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); - lattice_writer.Write(key, out_lat); - } - n_done++; - } + if (write_compact) { + compact_lattice_writer.Write(key, determinized_clat); } else { - fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &determinized_clat); - if (write_compact) { - compact_lattice_writer.Write(key, determinized_clat); - } else { - Lattice out_lat; - fst::ConvertLattice(determinized_clat, &out_lat); - - // Replace each arc (t, tid) with the averaged acoustic score from - // the computed map - ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); - lattice_writer.Write(key, out_lat); - } - n_done++; + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + Lattice out_lat; + ConvertLattice(determinized_clat, &out_lat); + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lattice_writer.Write(key, out_lat); } + n_done++; } } else { // Zero scale so nothing to do. 
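The changes above add a --write-compact option to lattice-lmrescore, lattice-lmrescore-const-arpa, lattice-determinize-pruned and lattice-determinize-phone-pruned, and an --alpha-acoustic option to lattice-interp: before determinization each binary builds a map from (t, tid) to the averaged acoustic score, and when writing non-compact output it restores the per-frame acoustic scores from that map, so undeterminized lattices survive LM rescoring with usable frame-level acoustic scores. A rough sketch of how the recipe scripts are expected to drive this, essentially the pattern used in run_tdnn_15k_semisupervised_conf_s.sh in the next patch (directory names here are placeholders; --nj/--cmd/--online-ivector-dir options omitted):

  # decode the unsupervised data, keeping undeterminized lattices
  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
    --write-compact false --determinize-opts "--word-determinize=false" \
    --skip-scoring true \
    $graphdir data/${unsupervised_set}_hires $chaindir/decode_${unsupervised_set}

  # rescore with a ConstArpa LM while keeping the lattices undeterminized
  steps/lmrescore_const_arpa.sh --write-compact false \
    --read-determinized false --write-determinized false --skip-scoring true \
    data/lang_test data/lang_test_fg data/${unsupervised_set}_hires \
    $chaindir/decode_${unsupervised_set} $chaindir/decode_${unsupervised_set}_fg

With --read-determinized false, lmrescore_const_arpa.sh determinizes the input lattices on the fly (lattice-determinize-pruned --write-compact=$write_compact) before rescoring; with --write-determinized false it pipes the rescored lattices through lattice-interp --alpha=0 --alpha-acoustic=1.0 against the original undeterminized lattices, so the output keeps the original acoustic scores and lattice structure while taking its LM/graph scores from the rescored pass.
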
From 8a035ab5cb6487ab212dd76ffa1b2dc133b2ea35 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 6 Nov 2017 02:31:38 -0500 Subject: [PATCH 090/174] semisup-smbr: Bug fix in 15k_s --- .../chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh index 5c991e7770c..25ba2ad299f 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_s.sh @@ -121,13 +121,15 @@ for dset in $unsupervised_set; do echo "$0: getting the decoding lattices for the unsupervised subset using the chain model at: $chaindir" steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ --acwt 1.0 --post-decode-acwt 10.0 --write-compact false \ + --determinize-opts "--word-determinize=false" \ --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ - --scoring-opts "--min-lmwt 10 --max-lmwt 10" --determinize-opts "--word-determinize=false" \ + --scoring-opts "--min-lmwt 10 --max-lmwt 10" --skip-scoring true \ $graphdir data/${dset}_sp_hires $chaindir/decode_${dset}_sp${decode_affix} fi - if [ $stage -le 6 ]; then + if [ $stage -le 5 ]; then steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" --write-compact false \ + --read-determinized false --write-determinized false --skip-scoring true \ data/lang_test${graph_affix} \ data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ $chaindir/decode_${dset}_sp${decode_affix} \ From 34f780abf2c47c7a59a233fcc5403b2e4fa48ed4 Mon Sep 17 00:00:00 2001 From: System User Date: Mon, 6 Nov 2017 17:44:37 -0500 Subject: [PATCH 091/174] semisup-smbr: Adding undeterminized version of rescoring --- .../run_tdnn_100k_semisupervised_conf_b.sh | 3 +- .../run_tdnn_100k_semisupervised_conf_d.sh | 462 ++++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_ab.sh | 445 +++++++++++++++++ .../run_tdnn_50k_semisupervised_conf_i.sh | 443 +++++++++++++++++ .../run_tdnn_50k_semisupervised_conf_j.sh | 452 +++++++++++++++++ .../lmrescore_const_arpa_undeterminized.sh | 88 ++++ 6 files changed, 1892 insertions(+), 1 deletion(-) create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_d.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ab.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_i.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_j.sh create mode 100755 egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_b.sh index d38b97bfe43..609e2009280 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_b.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_b.sh @@ -145,7 +145,8 @@ for dset in $unsupervised_set; do if [ $stage -le 6 ]; then steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ - --write-compact false --determinize false --skip-scoring true \ + --write-compact false --read-determinized false --write-determinized false \ 
+ --skip-scoring true \ data/lang_test${graph_affix} \ data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ $chaindir/decode_${dset}_sp${decode_affix} \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_d.sh new file mode 100644 index 00000000000..4e2df5e5d16 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_d.sh @@ -0,0 +1,462 @@ +#!/bin/bash + +# This script is similar to _c, but uses smart splitting. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7c # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1d # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_a +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
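+  # Note: the supervised egs above are dumped with --alignment-subsampling-factor 3
+  # because the tri4a lattices are at the original (un-subsampled) frame rate,
+  # whereas the lattices below come from decoding with the chain model itself,
+  # which already operates at the subsampled rate, hence
+  # --alignment-subsampling-factor 1. If unsup_frames_per_eg is unset it
+  # defaults to the supervised frames_per_eg (see above).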
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ab.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ab.sh new 
file mode 100644 index 00000000000..f3626571de9 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ab.sh @@ -0,0 +1,445 @@ +#!/bin/bash + +# This script is same as _z, but does rescoring correctly. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ab # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
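+  # Note: --left-tolerance/--right-tolerance (here $tolerance) bound how far the
+  # phone boundaries in the lattice supervision may shift when the utterances
+  # are split into chunks of $unsup_frames_per_eg frames; see get_egs_split.sh
+  # for the exact semantics.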
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_i.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_i.sh new 
file mode 100755
index 00000000000..a114d09463a
--- /dev/null
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_i.sh
@@ -0,0 +1,443 @@
+#!/bin/bash
+
+# This script is the same as _h, but uses a 3-gram LM.
+# unsup_frames_per_eg=150
+# Deriv weights: Lattice posterior of best path pdf
+# Unsupervised weight: 1.0
+# Weights for phone LM (supervised, unsupervised): 3,2
+# LM for decoding unsupervised data: 3gram
+
+set -u -e -o pipefail
+
+stage=-2
+train_stage=-100
+nj=40
+decode_nj=40
+base_train_set=semisup50k_250k  # for reference
+exp=exp/semisup_50k
+
+unsupervised_set=train_unsup250k  # set this to your choice of unsupervised data
+supervised_set=train_sup50k
+semi_affix=semi50k_250k  # affix relating train-set splitting proportion
+
+tdnn_affix=7b  # affix for the supervised chain-model directory
+train_supervised_opts="--stage -10 --train-stage -10"
+
+# Unsupervised options
+decode_affix=_undet
+egs_affix=  # affix for the egs that are generated from unsupervised data and for the combined egs dir
+unsup_frames_per_eg=  # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly
+lattice_lm_scale=0.5  # lm-scale for using the weights from unsupervised lattices
+lattice_prune_beam=4.0  # If supplied will prune the lattices prior to getting egs for unsupervised data
+tolerance=1
+graph_affix=_ex250k  # can be used to decode the unsup data with another lm/graph
+phone_insertion_penalty=
+
+# Semi-supervised options
+comb_affix=comb1h  # affix for new chain-model directory trained on the combined supervised+unsupervised subsets
+supervision_weights=1.0,1.0
+lm_weights=3,2
+sup_egs_dir=
+unsup_egs_dir=
+comb_egs_dir=
+tree_affix=
+unsup_egs_opts=
+apply_deriv_weights=true
+
+do_finetuning=false
+
+extra_left_context=0
+extra_right_context=0
+
+train_extra_opts=
+
+xent_regularize=0.1
+hidden_dim=725
+minibatch_size="150=128/300=64"
+# to tune:
+# frames_per_eg for unsupervised
+
+decode_iter=
+lang_test_suffix=
+
+finetune_stage=-2
+finetune_suffix=_finetune
+finetune_iter=final
+num_epochs_finetune=1
+finetune_xent_regularize=0.1
+finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05"
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+nnet3_affix=_${semi_affix}  # affix for nnet3 and chain dirs
+decode_affix=${decode_affix}${graph_affix}
+egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}
+tree_affix=
+
+RANDOM=0
+
+if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}_fg/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix}_fg \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
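+  # For intuition about --deriv-weights-scp (numbers below are made up):
+  # weights.scp holds one weight per frame, e.g.
+  #   utt1  [ 1.00 0.92 0.35 ... ]
+  # and, with --chain.apply-deriv-weights true at training time, each frame's
+  # derivative is scaled by its weight, so low-confidence frames of the
+  # unsupervised data contribute less to the parameter update.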
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}_fg/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_j.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_j.sh new 
file mode 100755
index 00000000000..123464c34ab
--- /dev/null
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_j.sh
@@ -0,0 +1,452 @@
+#!/bin/bash
+
+# This script is the same as _h, but uses a 4-gram LM.
+# unsup_frames_per_eg=150
+# Deriv weights: Lattice posterior of best path pdf
+# Unsupervised weight: 1.0
+# Weights for phone LM (supervised, unsupervised): 3,2
+# LM for decoding unsupervised data: 4gram
+
+set -u -e -o pipefail
+
+stage=-2
+train_stage=-100
+nj=40
+decode_nj=40
+base_train_set=semisup50k_250k  # for reference
+exp=exp/semisup_50k
+
+unsupervised_set=train_unsup250k  # set this to your choice of unsupervised data
+supervised_set=train_sup50k
+semi_affix=semi50k_250k  # affix relating train-set splitting proportion
+
+tdnn_affix=7b  # affix for the supervised chain-model directory
+train_supervised_opts="--stage -10 --train-stage -10"
+
+# Unsupervised options
+decode_affix=_undet
+egs_affix=  # affix for the egs that are generated from unsupervised data and for the combined egs dir
+unsup_frames_per_eg=  # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly
+lattice_lm_scale=0.5  # lm-scale for using the weights from unsupervised lattices
+lattice_prune_beam=4.0  # If supplied will prune the lattices prior to getting egs for unsupervised data
+tolerance=1
+graph_affix=_ex250k  # can be used to decode the unsup data with another lm/graph
+phone_insertion_penalty=
+
+# Semi-supervised options
+comb_affix=comb1j  # affix for new chain-model directory trained on the combined supervised+unsupervised subsets
+supervision_weights=1.0,1.0
+lm_weights=3,2
+sup_egs_dir=
+unsup_egs_dir=
+comb_egs_dir=
+tree_affix=
+unsup_egs_opts=
+apply_deriv_weights=true
+
+do_finetuning=false
+
+extra_left_context=0
+extra_right_context=0
+
+train_extra_opts=
+
+xent_regularize=0.1
+hidden_dim=725
+minibatch_size="150=128/300=64"
+# to tune:
+# frames_per_eg for unsupervised
+
+decode_iter=
+lang_test_suffix=
+
+finetune_stage=-2
+finetune_suffix=_finetune
+finetune_iter=final
+num_epochs_finetune=1
+finetune_xent_regularize=0.1
+finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05"
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+nnet3_affix=_${semi_affix}  # affix for nnet3 and chain dirs
+decode_affix=${decode_affix}${graph_affix}
+egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}
+tree_affix=
+
+RANDOM=0
+
+if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}_fg/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix}_fg \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
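+  # Note on --left-tolerance/--right-tolerance ($tolerance=1 here): when the
+  # lattice alignments are turned into chain supervision, each phone's start
+  # and end time may shift by up to this many frames, so a tolerance of 1
+  # allows one frame of slack around the decoded phone boundaries instead of
+  # pinning them exactly.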
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}_fg/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh b/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh new file mode 100755 index 00000000000..5dd2f1ac682 --- /dev/null 
+++ b/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +# Copyright 2014 Guoguo Chen +# 2017 Vimal Manohar +# Apache 2.0 + +# This script rescores lattices with the ConstArpaLm format language model. + +# Begin configuration section. +cmd=run.pl +skip_scoring=false +stage=1 +scoring_opts= +write_compact=true +acwt=0.1 +beam=8.0 +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. ./utils/parse_options.sh + +if [ $# != 5 ]; then + echo "Does language model rescoring of lattices (remove old LM, add new LM)" + echo "Usage: $0 [options] \\" + echo " " + echo "options: [--cmd (run.pl|queue.pl [queue opts])]" + exit 1; +fi + +[ -f path.sh ] && . ./path.sh; + +oldlang=$1 +newlang=$2 +data=$3 +indir=$4 +outdir=$5 + +oldlm=$oldlang/G.fst +newlm=$newlang/G.carpa +! cmp $oldlang/words.txt $newlang/words.txt &&\ + echo "$0: Warning: vocabularies may be incompatible." +[ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1; +[ ! -f $newlm ] && echo "$0: Missing file $newlm" && exit 1; +! ls $indir/lat.*.gz >/dev/null &&\ + echo "$0: No lattices input directory $indir" && exit 1; + +if ! cmp -s $oldlang/words.txt $newlang/words.txt; then + echo "$0: $oldlang/words.txt and $newlang/words.txt differ: make sure you know what you are doing."; +fi + +oldlmcommand="fstproject --project_output=true $oldlm |" + +mkdir -p $outdir/log +nj=`cat $indir/num_jobs` || exit 1; +cp $indir/num_jobs $outdir + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $outdir/log/remove_lm_costs.JOB.log \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$beam --write-compact=$write_compact \ + "ark:gunzip -c $indir/lat.JOB.gz |" ark:- \| \ + lattice-lmrescore --lm-scale=-1.0 --write-compact=$write_compact \ + ark:- "$oldlmcommand" ark:- \| \ + lattice-interp --alpha=0.5 --alpha-acoustic=1.0 --write-compact=$write_compact \ + "ark:gunzip -c $indir/lat.JOB.gz |" ark,s,cs:- ark:- \| \ + lattice-scale --lm-scale=2.0 ark:- \ + "ark:| gzip -c > $outdir/lat_nolm.JOB.gz" || exit 1 +fi + +if [ $stage -le 2 ]; then + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-lmrescore-const-arpa --lm-scale=1.0 --write-compact=$write_compact \ + "ark:gunzip -c $outdir/lat_nolm.JOB.gz |" "$newlm" ark:- \| \ + lattice-interp --alpha=0.5 --alpha-acoustic=1.0 --write-compact=$write_compact \ + "ark:gunzip -c $outdir/lat_nolm.JOB.gz |" ark,s,cs:- ark:- \| \ + lattice-scale --lm-scale=2.0 ark:- \ + "ark:| gzip -c > $outdir/lat.JOB.gz" || exit 1 +fi + +if ! $skip_scoring && [ $stage -le 2 ]; then + err_msg="Not scoring because local/score.sh does not exist or not executable." + [ ! -x local/score.sh ] && echo $err_msg && exit 1; + local/score.sh --cmd "$cmd" $scoring_opts $data $newlang $outdir +else + echo "Not scoring because requested so..." 
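+# Example invocation (all paths here are illustrative, not from an actual run):
+#   steps/lmrescore_const_arpa_undeterminized.sh --write-compact false \
+#     data/lang_test data/lang_test_fg data/train_unsup250k_sp_hires \
+#     exp/semisup_50k/chain_semi50k_250k/tdnn_7b/decode_train_unsup250k_sp \
+#     exp/semisup_50k/chain_semi50k_250k/tdnn_7b/decode_train_unsup250k_sp_fg
+# i.e. <old-lang-dir> <new-lang-dir> <data-dir> <input-decode-dir> <output-decode-dir>.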
+fi + +exit 0; From 06510759773091a44cf58af33246502a569aeffb Mon Sep 17 00:00:00 2001 From: System User Date: Mon, 6 Nov 2017 18:41:31 -0500 Subject: [PATCH 092/174] semisup-smbr: Fix undeterminized lattice rescoring --- .../lmrescore_const_arpa_undeterminized.sh | 24 ++-- .../lmrescore_rnnlm_lat_undeterminized.sh | 132 ++++++++++++++++++ 2 files changed, 140 insertions(+), 16 deletions(-) create mode 100755 egs/wsj/s5/steps/lmrescore_rnnlm_lat_undeterminized.sh diff --git a/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh b/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh index 5dd2f1ac682..f487d135696 100755 --- a/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh +++ b/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh @@ -13,7 +13,7 @@ stage=1 scoring_opts= write_compact=true acwt=0.1 -beam=8.0 +beam=8.0 # beam used in determinization # End configuration section. echo "$0 $@" # Print the command line for logging @@ -56,24 +56,16 @@ nj=`cat $indir/num_jobs` || exit 1; cp $indir/num_jobs $outdir if [ $stage -le 1 ]; then - $cmd JOB=1:$nj $outdir/log/remove_lm_costs.JOB.log \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$beam --write-compact=$write_compact \ + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$beam \ "ark:gunzip -c $indir/lat.JOB.gz |" ark:- \| \ - lattice-lmrescore --lm-scale=-1.0 --write-compact=$write_compact \ - ark:- "$oldlmcommand" ark:- \| \ + lattice-scale --lm-scale=0.0 ark:- ark:- \| \ + lattice-lmrescore --lm-scale=-1.0 ark:- "$oldlmcommand" ark:- \| \ + lattice-lmrescore-const-arpa --lm-scale=1.0 \ + ark:- "$newlm" ark:- \| \ lattice-interp --alpha=0.5 --alpha-acoustic=1.0 --write-compact=$write_compact \ "ark:gunzip -c $indir/lat.JOB.gz |" ark,s,cs:- ark:- \| \ - lattice-scale --lm-scale=2.0 ark:- \ - "ark:| gzip -c > $outdir/lat_nolm.JOB.gz" || exit 1 -fi - -if [ $stage -le 2 ]; then - $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ - lattice-lmrescore-const-arpa --lm-scale=1.0 --write-compact=$write_compact \ - "ark:gunzip -c $outdir/lat_nolm.JOB.gz |" "$newlm" ark:- \| \ - lattice-interp --alpha=0.5 --alpha-acoustic=1.0 --write-compact=$write_compact \ - "ark:gunzip -c $outdir/lat_nolm.JOB.gz |" ark,s,cs:- ark:- \| \ - lattice-scale --lm-scale=2.0 ark:- \ + lattice-scale --lm-scale=2.0 --write-compact=$write_compact ark:- \ "ark:| gzip -c > $outdir/lat.JOB.gz" || exit 1 fi diff --git a/egs/wsj/s5/steps/lmrescore_rnnlm_lat_undeterminized.sh b/egs/wsj/s5/steps/lmrescore_rnnlm_lat_undeterminized.sh new file mode 100755 index 00000000000..58217a75e75 --- /dev/null +++ b/egs/wsj/s5/steps/lmrescore_rnnlm_lat_undeterminized.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +# Copyright 2015 Guoguo Chen +# 2017 Hainan Xu +# Apache 2.0 + +# This script rescores lattices with RNNLM. See also rnnlmrescore.sh which is +# an older script using n-best lists. + +# Begin configuration section. +cmd=run.pl +skip_scoring=false +max_ngram_order=4 +N=10 +inv_acwt=12 +weight=1.0 # Interpolation weight for RNNLM. + +expand_ngram=false +beam= +write_compact=true +# End configuration section. +rnnlm_ver= +#layer_string= + +echo "$0 $@" # Print the command line for logging + +. ./utils/parse_options.sh + +if [ $# != 5 ]; then + echo "Does language model rescoring of lattices (remove old LM, add new LM)" + echo "with RNNLM." 
+ echo "" + echo "Usage: $0 [options] \\" + echo " " + echo " e.g.: $0 ./rnnlm data/lang_tg data/test \\" + echo " exp/tri3/test_tg exp/tri3/test_rnnlm" + echo "options: [--cmd (run.pl|queue.pl [queue opts])]" + exit 1; +fi + +[ -f path.sh ] && . ./path.sh; + +oldlang=$1 +rnnlm_dir=$2 +data=$3 +indir=$4 +outdir=$5 + +rescoring_binary=lattice-lmrescore-rnnlm + +first_arg=ark:$rnnlm_dir/unk.probs # this is for mikolov's rnnlm +extra_arg= + +if [ "$rnnlm_ver" == "cuedrnnlm" ]; then + layer_string=`cat $rnnlm_dir/layer_string | sed "s=:= =g"` + total_size=`wc -l $rnnlm_dir/unigram.counts | awk '{print $1}'` + rescoring_binary="lattice-lmrescore-cuedrnnlm" + cat $rnnlm_dir/rnnlm.input.wlist.index | tail -n +2 | awk '{print $1-1,$2}' > $rnnlm_dir/rnn.wlist + extra_arg="--full-voc-size=$total_size --layer-sizes=\"$layer_string\"" + first_arg=$rnnlm_dir/rnn.wlist +fi + +if [ "$rnnlm_ver" == "tensorflow" ]; then + rescoring_binary="lattice-lmrescore-tf-rnnlm" + first_arg="$rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final" +fi + +oldlm=$oldlang/G.fst +if [ -f $oldlang/G.carpa ]; then + oldlm=$oldlang/G.carpa +elif [ ! -f $oldlm ]; then + echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" &&\ + exit 1; +fi + +[ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1; +[ ! -f $rnnlm_dir/rnnlm ] && [ ! -d $rnnlm_dir/rnnlm ] && echo "$0: Missing file $rnnlm_dir/rnnlm" && exit 1; +[ ! -f $rnnlm_dir/unk.probs ] &&\ + echo "$0: Missing file $rnnlm_dir/unk.probs" && exit 1; +[ ! -f $oldlang/words.txt ] &&\ + echo "$0: Missing file $oldlang/words.txt" && exit 1; +! ls $indir/lat.*.gz >/dev/null &&\ + echo "$0: No lattices input directory $indir" && exit 1; +awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { + print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ + || exit 1; + +if [ "$oldlm" == "$oldlang/G.fst" ]; then + lmscore_removing_binary=lattice-lmrescore + oldlm="fstproject --project_output=true $oldlm |" +else + lmscore_removing_binary=lattice-lmrescore-const-arpa +fi + +acwt=`perl -e "print (1.0/$inv_acwt);"` + +mkdir -p $outdir/log +nj=`cat $indir/num_jobs` || exit 1; +cp $indir/num_jobs $outdir + +lattice_expand_cmd= +if $expand_ngram; then + lattice_expand_cmd="| lattice-expand-ngram --n=$max_ngram_order ark:- ark:-" +fi + +oldlm_weight=`perl -e "print -1.0 * $weight;"` + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-determinize-pruned --acoustic-scale=$acwt ${beam:+--beam=$beam} \ + "ark:gunzip -c $indir/lat.JOB.gz |" ark:- \| \ + lattice-scale --lm-scale=0.0 ark:- ark:- \| \ + $lmscore_removing_binary --lm-scale=$oldlm_weight \ + ark:- "$oldlm" ark:- $lattice_expand_cmd \| \ + $rescoring_binary $extra_arg --lm-scale=$weight \ + --max-ngram-order=$max_ngram_order \ + $first_arg $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" ark:- \| \ + lattice-interp --alpha=0.5 --alpha-acoustic=1.0 --write-compact=$write_compact \ + "ark:gunzip -c $indir/lat.JOB.gz |" ark,s,cs:- ark:- \| \ + lattice-scale --lm-scale=2.0 --write-compact=$write_compact ark:- \ + "ark:| gzip -c > $outdir/lat.JOB.gz" || exit 1 +fi + +if ! $skip_scoring ; then + err_msg="Not scoring because local/score.sh does not exist or not executable." + [ ! -x local/score.sh ] && echo $err_msg && exit 1; + local/score.sh --cmd "$cmd" $data $oldlang $outdir +else + echo "Not scoring because requested so..." 
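+# Note on --max-ngram-order (4 by default here): RNNLM lattice rescoring is
+# approximate; lattice states whose word histories agree in the last
+# max_ngram_order-1 words are merged and scored once, which keeps the rescored
+# lattice a manageable size.  A larger value is slower but closer to exact
+# rescoring.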
+fi + +exit 0; From eadc843a4d365cb4c09b9de6ab9b8ac23c593b61 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sat, 11 Nov 2017 20:00:09 -0500 Subject: [PATCH 093/174] semisup: 50 hours recipe --- .../s5/local/semisup/run_50k.sh | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 egs/fisher_english/s5/local/semisup/run_50k.sh diff --git a/egs/fisher_english/s5/local/semisup/run_50k.sh b/egs/fisher_english/s5/local/semisup/run_50k.sh new file mode 100644 index 00000000000..eeb6f3fd3c4 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/run_50k.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +. cmd.sh +. path.sh + +stage=-1 +train_stage=-10 + +. utils/parse_options.sh + +set -o pipefail +exp=exp/semisup_50k + +false && { +utils/subset_data_dir.sh --speakers data/train_sup 50000 data/train_sup50k || exit 1 +utils/subset_data_dir.sh --shortest data/train_sup50k 25000 data/train_sup50k_short || exit 1 +utils/subset_data_dir.sh --speakers data/train_sup50k 30000 data/train_sup50k_30k || exit 1; + +steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup50k_short data/lang $exp/mono0a || exit 1 + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k_30k data/lang $exp/mono0a $exp/mono0a_ali || exit 1 + +steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_sup50k_30k data/lang $exp/mono0a_ali $exp/tri1 || exit 1 + +(utils/mkgraph.sh data/lang_test $exp/tri1 $exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri1/graph data/dev $exp/tri1/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k_30k data/lang $exp/tri1 $exp/tri1_ali || exit 1; + +steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_sup50k_30k data/lang $exp/tri1_ali $exp/tri2 || exit 1 + +(utils/mkgraph.sh data/lang_test $exp/tri2 $exp/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri2/graph data/dev $exp/tri2/decode_dev)& + + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k data/lang $exp/tri2 $exp/tri2_ali || exit 1; + +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 30000 data/train_sup50k data/lang $exp/tri2_ali $exp/tri3a || exit 1; + +(utils/mkgraph.sh data/lang_test $exp/tri3a $exp/tri3a/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri3a/graph data/dev $exp/tri3a/decode_dev)& + +steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k data/lang $exp/tri3a $exp/tri3a_ali || exit 1; + +steps/train_sat.sh --cmd "$train_cmd" \ + 4000 50000 data/train_sup50k data/lang $exp/tri3a_ali $exp/tri4a || exit 1; + +( + utils/mkgraph.sh data/lang_test $exp/tri4a $exp/tri4a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri4a/graph data/dev $exp/tri4a/decode_dev +)& + +utils/combine_data.sh data/semisup50k_250k data/train_sup50k data/train_unsup250k || exit 1 + +local/semisup/chain/tuning/run_tdnn_50k.sh \ + --train-set train_sup50k \ + --nnet3-affix _semi50k_250k \ + --chain-affix _semi50k_250k \ + --gmm tri4a \ + --stage $stage --train-stage $train_stage \ + --exp $exp \ + --ivector-train-set semisup50k_250k || exit 1 +} + +local/semisup/chain/tuning/run_tdnn_oracle.sh \ + --train-set semisup50k_250k \ + --nnet3-affix _semi50k_250k \ + --chain-affix _semi50k_250k_oracle \ + --gmm tri4a \ + --stage 9 --train-stage $train_stage \ + --exp $exp \ + --ivector-train-set semisup50k_250k 
|| exit 1

From c5acc17cf6e65c66518b0fbdebd8c1bfed116215 Mon Sep 17 00:00:00 2001
From: Vimal Manohar
Date: Tue, 14 Nov 2017 14:31:53 -0500
Subject: [PATCH 094/174] semisup: Pocolm for fisher english

---
 .../s5/local/fisher_train_lms_pocolm.sh      | 170 ++++++++++++++++++
 egs/fisher_english/s5/local/run_unk_model.sh |  45 +++++
 2 files changed, 215 insertions(+)
 create mode 100755 egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh
 create mode 100755 egs/fisher_english/s5/local/run_unk_model.sh

diff --git a/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh b/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh
new file mode 100755
index 00000000000..0152a64ae01
--- /dev/null
+++ b/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh
@@ -0,0 +1,170 @@
+#!/bin/bash
+
+# Copyright 2016  Vincent Nguyen
+#           2016  Johns Hopkins University (author: Daniel Povey)
+#           2017  Vimal Manohar
+# Apache 2.0
+#
+# It is based on the example scripts distributed with PocoLM
+
+set -e
+stage=0
+
+text=data/train/text
+lexicon=data/local/dict/lexicon.txt
+dir=data/local/pocolm
+
+num_ngrams_large=5000000
+num_ngrams_small=2500000
+
+echo "$0 $@"  # Print the command line for logging
+. utils/parse_options.sh || exit 1;
+
+lm_dir=${dir}/data
+
+mkdir -p $dir
+. ./path.sh || exit 1; # for KALDI_ROOT
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+( # First make sure the pocolm toolkit is installed.
+ cd $KALDI_ROOT/tools || exit 1;
+ if [ -d pocolm ]; then
+   echo Not installing the pocolm toolkit since it is already there.
+ else
+   echo "$0: Please install the PocoLM toolkit with: "
+   echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
+   exit 1;
+ fi
+) || exit 1;
+
+for f in "$text" "$lexicon"; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+num_dev_sentences=10000
+
+#bypass_metaparam_optim_opt=
+# If you want to bypass the metaparameter optimization steps with specific metaparameters
+# un-comment the following line, and change the numbers to some appropriate values.
+# You can find the values from output log of train_lm.py.
+# These example numbers of metaparameters are for a 4-gram model (with min-counts)
+# running with train_lm.py.
+# The dev perplexity should be close to the non-bypassed model.
+#bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.854,0.0722,0.5808,0.338,0.166,0.015,0.999,0.6228,0.340,0.172,0.999,0.788,0.501,0.406"
+# Note: to use these example parameters, you may need to remove the .done files
+# to make sure that make_lm_dir.py gets called and trains only the 3-gram model
+#for order in 3; do
+#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done
+
+if [ $stage -le 0 ]; then
+  mkdir -p ${dir}/data
+  mkdir -p ${dir}/data/text
+
+  echo "$0: Getting the Data sources"
+
+  rm ${dir}/data/text/* 2>/dev/null || true
+
+  cleantext=$dir/text_all.gz
+
+  cut -d ' ' -f 2- $text | awk -v lex=$lexicon '
+  BEGIN{
+    while((getline<lex) >0) { seen[$1]=1; }
+  }
+  {
+    for(n=1; n<=NF;n++) {
+      if (seen[$n]) {
+        printf("%s ", $n);
+      } else {
+        printf("<unk> ");
+      }
+    }
+    printf("\n");
+  }' | gzip -c > $cleantext || exit 1;
+
+  # This is for reporting perplexities
+  gunzip -c $dir/text_all.gz | head -n $num_dev_sentences > \
+    ${dir}/data/test.txt
+
+  # use a subset of the annotated training data as the dev set.
+  # Note: the name 'dev' is treated specially by pocolm, it automatically
+  # becomes the dev set.
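+  # For example, with the default num_dev_sentences=10000 the split below is:
+  #   lines 1-10000     -> ${dir}/data/test.txt       (held out, perplexity reporting only)
+  #   lines 10001-20000 -> ${dir}/data/text/dev.txt   (pocolm's dev set, folded back into training)
+  #   lines 20001-end   -> ${dir}/data/text/train.txt (main LM training data)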
+  gunzip -c $dir/text_all.gz | tail -n +$[num_dev_sentences+1] | \
+    head -n $num_dev_sentences > ${dir}/data/text/dev.txt
+
+  gunzip -c $dir/text_all.gz | tail -n +$[2*num_dev_sentences+1] > \
+    ${dir}/data/text/train.txt
+
+  # for reporting perplexities, we'll use the "real" dev set.
+  # (a subset of the training data is used as ${dir}/data/text/dev.txt to work
+  # out interpolation weights.)
+  # note, we can't put it in ${dir}/data/text/, because then pocolm would use
+  # it as one of the data sources.
+  cut -d " " -f 2- < data/dev_and_test/text > ${dir}/data/real_dev_set.txt
+
+  cat $lexicon | awk '{print $1}' | sort | uniq | awk '
+  {
+    if ($1 == "<s>") {
+      print "<s> is in the vocabulary!" | "cat 1>&2"
+      exit 1;
+    }
+    if ($1 == "</s>") {
+      print "</s> is in the vocabulary!" | "cat 1>&2"
+      exit 1;
+    }
+    printf("%s\n", $1);
+  }' > $dir/data/wordlist || exit 1;
+fi
+
+order=4
+wordlist=${dir}/data/wordlist
+
+lm_name="`basename ${wordlist}`_${order}"
+min_counts='train=1'
+if [ -n "${min_counts}" ]; then
+  lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
+fi
+
+unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
+
+if [ $stage -le 1 ]; then
+  # decide on the vocabulary.
+  # Note: you'd use --wordlist if you had a previously determined word-list
+  # that you wanted to use.
+  # Note: if you have more than one order, use a certain amount of words as the
+  # vocab and want to restrict max memory for 'sort',
+  echo "$0: training the unpruned LM"
+  train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \
+    --limit-unk-history=true \
+    --fold-dev-into=train ${bypass_metaparam_optim_opt} \
+    --min-counts="${min_counts}" \
+    ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
+
+  get_data_prob.py ${dir}/data/test.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' | tee ${unpruned_lm_dir}/perplexity_test.log
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' | tee ${unpruned_lm_dir}/perplexity_real_dev_set.log
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: pruning the LM (to larger size)"
+  # Using 5 million n-grams for a big LM for rescoring purposes.
+  prune_lm_dir.py --target-num-ngrams=$num_ngrams_large --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big
+
+  get_data_prob.py ${dir}/data/test.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_big/perplexity_test.log
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_big/perplexity_real_dev_set.log
+
+  mkdir -p ${dir}/data/arpa
+  format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz
+fi
+
+if [ $stage -le 3 ]; then
+  echo "$0: pruning the LM (to smaller size)"
+  # Using 2.5 million n-grams for a smaller LM for graph building.  Prune from the
+  # bigger-pruned LM, it'll be faster.
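+  # For reference, the end products of this script are the two pruned ARPA LMs
+  # under ${dir}/data/arpa (with order=4: 4gram_big.arpa.gz and
+  # 4gram_small.arpa.gz).  Typically the small one would be used to build
+  # G.fst for the decoding graph and the big one for const-arpa rescoring,
+  # but that wiring is left to the calling recipe.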
+ prune_lm_dir.py --target-num-ngrams=$num_ngrams_small ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/test.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_small/perplexity_test.log + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_small/perplexity_real_dev_set.log + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/fisher_english/s5/local/run_unk_model.sh b/egs/fisher_english/s5/local/run_unk_model.sh new file mode 100755 index 00000000000..6b2aca76495 --- /dev/null +++ b/egs/fisher_english/s5/local/run_unk_model.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +utils/lang/make_unk_lm.sh data/local/dict exp/unk_lang_model + +utils/prepare_lang.sh --unk-fst exp/unk_lang_model/unk_fst.txt \ + data/local/dict "" data/local/lang data/lang_unk + +# note: it's important that the LM we built in data/lang/G.fst was created using +# pocolm with the option --limit-unk-history=true (see ted_train_lm.sh). This +# keeps the graph compact after adding the unk model (we only have to add one +# copy of it). + +cp data/lang/G.fst data/lang_unk/G.fst + +utils/mkgraph.sh data/lang_unk exp/tri3 exp/tri3/graph_unk + +. ./cmd.sh + +## Caution: if you use this unk-model stuff, be sure that the scoring script +## does not use lattice-align-words-lexicon, because it's not compatible with +## the unk-model. Instead you should use lattice-align-words (of course, this +## only works if you have position-dependent phones). + +decode_nj=30 +for dset in dev test; do + steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri3/graph_unk data/${dset} exp/tri3/decode_${dset}_unk + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset} exp/tri3/decode_${dset}_unk exp/tri3/decode_${dset}_unk_rescore +done + +# # for x in exp/tri3/decode*; do grep Sum $x/*/*ys | utils/best_wer.sh ; done | grep -v old | grep -v si + +# # dev results. unk-model helps slightly before rescoring. +# %WER 19.3 | 507 17783 | 83.7 11.6 4.7 3.0 19.3 91.5 | -0.076 | exp/tri3/decode_dev/score_17_0.0/ctm.filt.filt.sys +# %WER 18.2 | 507 17783 | 84.8 10.7 4.5 3.0 18.2 91.3 | -0.111 | exp/tri3/decode_dev_rescore/score_16_0.0/ctm.filt.filt.sys +# %WER 19.1 | 507 17783 | 83.7 11.3 5.1 2.8 19.1 91.9 | -0.044 | exp/tri3/decode_dev_unk/score_17_0.0/ctm.filt.filt.sys +# %WER 18.2 | 507 17783 | 84.5 10.6 4.9 2.8 18.2 91.5 | -0.047 | exp/tri3/decode_dev_unk_rescore/score_15_0.0/ctm.filt.filt.sys + + +# # dev results. unk-model helps slightly after rescoring. 
+# %WER 17.3 | 1155 27500 | 85.0 11.5 3.5 2.4 17.3 86.9 | -0.035 | exp/tri3/decode_test/score_15_0.0/ctm.filt.filt.sys +# %WER 16.6 | 1155 27500 | 85.8 11.0 3.2 2.4 16.6 86.4 | -0.098 | exp/tri3/decode_test_rescore/score_14_0.0/ctm.filt.filt.sys +# %WER 17.3 | 1155 27500 | 84.9 11.3 3.8 2.2 17.3 87.4 | -0.015 | exp/tri3/decode_test_unk/score_15_0.0/ctm.filt.filt.sys +# %WER 16.5 | 1155 27500 | 85.7 10.7 3.6 2.2 16.5 86.7 | -0.075 | exp/tri3/decode_test_unk_rescore/score_14_0.0/ctm.filt.filt.sys From f71741a1cffae2b6ea9322b6f8d85aa7e98c43e0 Mon Sep 17 00:00:00 2001 From: System User Date: Fri, 17 Nov 2017 14:48:57 -0500 Subject: [PATCH 095/174] semisup: Fix lattice rescoring --- .../lmrescore_const_arpa_undeterminized.sh | 23 ++++++++++++++----- .../lmrescore_rnnlm_lat_undeterminized.sh | 10 ++++---- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh b/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh index f487d135696..fbc99e195d6 100755 --- a/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh +++ b/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh @@ -14,6 +14,8 @@ scoring_opts= write_compact=true acwt=0.1 beam=8.0 # beam used in determinization +remove_nonG_graph_costs=false + # End configuration section. echo "$0 $@" # Print the command line for logging @@ -55,18 +57,27 @@ mkdir -p $outdir/log nj=`cat $indir/num_jobs` || exit 1; cp $indir/num_jobs $outdir +lats_rspecifier="ark:gunzip -c $indir/lat.JOB.gz |" +lattice_copy_cmd="lattice-lmrescore --lm-scale=-1.0 ark:- \"$oldlmcommand\" ark:- |" +if $remove_nonG_graph_costs; then + lats_rspecifier="$lats_rspecifier lattice-scale --write-compact=$write_compact --lm-scale=0.0 ark:- ark:- |" + lattice_copy_cmd= +fi + +lats_wspecifier="ark:| gzip -c > $outdir/lat.JOB.gz" + if [ $stage -le 1 ]; then $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ lattice-determinize-pruned --acoustic-scale=$acwt --beam=$beam \ "ark:gunzip -c $indir/lat.JOB.gz |" ark:- \| \ - lattice-scale --lm-scale=0.0 ark:- ark:- \| \ - lattice-lmrescore --lm-scale=-1.0 ark:- "$oldlmcommand" ark:- \| \ + lattice-scale --lm-scale=0.0 --acoustic-scale=0.0 ark:- ark:- \| \ + $lattice_copy_cmd \ lattice-lmrescore-const-arpa --lm-scale=1.0 \ ark:- "$newlm" ark:- \| \ - lattice-interp --alpha=0.5 --alpha-acoustic=1.0 --write-compact=$write_compact \ - "ark:gunzip -c $indir/lat.JOB.gz |" ark,s,cs:- ark:- \| \ - lattice-scale --lm-scale=2.0 --write-compact=$write_compact ark:- \ - "ark:| gzip -c > $outdir/lat.JOB.gz" || exit 1 + lattice-project ark:- ark:- \| \ + lattice-compose --write-compact=$write_compact \ + "$lats_rspecifier" \ + ark,s,cs:- "$lats_wspecifier" || exit 1 fi if ! 
$skip_scoring && [ $stage -le 2 ]; then diff --git a/egs/wsj/s5/steps/lmrescore_rnnlm_lat_undeterminized.sh b/egs/wsj/s5/steps/lmrescore_rnnlm_lat_undeterminized.sh index 58217a75e75..d4f25347db5 100755 --- a/egs/wsj/s5/steps/lmrescore_rnnlm_lat_undeterminized.sh +++ b/egs/wsj/s5/steps/lmrescore_rnnlm_lat_undeterminized.sh @@ -109,16 +109,16 @@ if [ $stage -le 1 ]; then $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ lattice-determinize-pruned --acoustic-scale=$acwt ${beam:+--beam=$beam} \ "ark:gunzip -c $indir/lat.JOB.gz |" ark:- \| \ - lattice-scale --lm-scale=0.0 ark:- ark:- \| \ + lattice-scale --lm-scale=0.0 --acoustic-scale=0.0 ark:- ark:- \| \ $lmscore_removing_binary --lm-scale=$oldlm_weight \ ark:- "$oldlm" ark:- $lattice_expand_cmd \| \ $rescoring_binary $extra_arg --lm-scale=$weight \ --max-ngram-order=$max_ngram_order \ $first_arg $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" ark:- \| \ - lattice-interp --alpha=0.5 --alpha-acoustic=1.0 --write-compact=$write_compact \ - "ark:gunzip -c $indir/lat.JOB.gz |" ark,s,cs:- ark:- \| \ - lattice-scale --lm-scale=2.0 --write-compact=$write_compact ark:- \ - "ark:| gzip -c > $outdir/lat.JOB.gz" || exit 1 + lattice-project ark:- ark:- \| \ + lattice-compose --write-compact=$write_compact \ + "ark:gunzip -c $indir/lat.JOB.gz |" \ + ark,s,cs:- "ark:| gzip -c > $outdir/lat.JOB.gz" || exit 1 fi if ! $skip_scoring ; then From 5103952ce934708dd0fbe1de544140393561bc70 Mon Sep 17 00:00:00 2001 From: System User Date: Fri, 17 Nov 2017 16:22:50 -0500 Subject: [PATCH 096/174] semisup: Code changes for undeterminized lattices --- src/chainbin/nnet3-chain-split-and-get-egs.cc | 9 ++-- src/lat/lattice-functions.cc | 4 +- src/latbin/lattice-compose.cc | 48 +++++++++++++++---- src/latbin/lattice-interp.cc | 5 +- 4 files changed, 49 insertions(+), 17 deletions(-) diff --git a/src/chainbin/nnet3-chain-split-and-get-egs.cc b/src/chainbin/nnet3-chain-split-and-get-egs.cc index 800ef52861d..384433fac69 100644 --- a/src/chainbin/nnet3-chain-split-and-get-egs.cc +++ b/src/chainbin/nnet3-chain-split-and-get-egs.cc @@ -92,9 +92,10 @@ static bool ProcessFile(const chain::SupervisionOptions &sup_opts, chain::Supervision supervision_part; - sup_lat_splitter.GetFrameRangeSupervision(start_frame_subsampled, - num_frames_subsampled, - &supervision_part); + if (!sup_lat_splitter.GetFrameRangeSupervision(start_frame_subsampled, + num_frames_subsampled, + &supervision_part)) + return false; if (normalization_fst.NumStates() > 0 && !chain::AddWeightToSupervisionFst(normalization_fst, @@ -109,7 +110,7 @@ static bool ProcessFile(const chain::SupervisionOptions &sup_opts, int32 first_frame = 0; // we shift the time-indexes of all these parts so // that the supervised part starts from frame 0. 
- + NnetChainExample nnet_chain_eg; nnet_chain_eg.outputs.resize(1); diff --git a/src/lat/lattice-functions.cc b/src/lat/lattice-functions.cc index 9e59318e7e1..0930e3bf492 100644 --- a/src/lat/lattice-functions.cc +++ b/src/lat/lattice-functions.cc @@ -1675,7 +1675,7 @@ void ComputeAcousticScoresMap( acoustic_scores->clear(); std::vector state_times; - LatticeStateTimes(lat, &state_times); + LatticeStateTimes(lat, &state_times); // Assumes the input is top sorted KALDI_ASSERT(lat.Start() == 0); @@ -1729,7 +1729,7 @@ void ReplaceAcousticScoresFromMap( typedef Arc::Weight LatticeWeight; typedef Arc::StateId StateId; - fst::TopSort(lat); + TopSortLatticeIfNeeded(lat); std::vector state_times; LatticeStateTimes(*lat, &state_times); diff --git a/src/latbin/lattice-compose.cc b/src/latbin/lattice-compose.cc index b9b261f7d36..10ac9f199fd 100644 --- a/src/latbin/lattice-compose.cc +++ b/src/latbin/lattice-compose.cc @@ -22,6 +22,7 @@ #include "util/common-utils.h" #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" int main(int argc, char *argv[]) { try { @@ -46,8 +47,10 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); + bool write_compact = true; int32 num_states_cache = 50000; int32 phi_label = fst::kNoLabel; // == -1 + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("phi-label", &phi_label, "If >0, the label on backoff arcs of the LM"); po.Register("num-states-cache", &num_states_cache, "Number of states we cache when mapping LM FST to lattice type. " @@ -67,9 +70,14 @@ int main(int argc, char *argv[]) { int32 n_done = 0, n_fail = 0; SequentialLatticeReader lattice_reader1(lats_rspecifier1); - // Write as compact lattice. - CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + if (write_compact) + compact_lattice_writer.Open(lats_wspecifier); + else + lattice_writer.Open(lats_wspecifier); if (ClassifyRspecifier(arg2, NULL, NULL) == kNoRspecifier) { std::string fst_rxfilename = arg2; @@ -94,6 +102,11 @@ int main(int argc, char *argv[]) { std::string key = lattice_reader1.Key(); KALDI_VLOG(1) << "Processing lattice for key " << key; Lattice lat1 = lattice_reader1.Value(); + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + if (!write_compact) + ComputeAcousticScoresMap(lat1, &acoustic_scores); ArcSort(&lat1, fst::OLabelCompare()); Lattice composed_lat; if (phi_label > 0) PhiCompose(lat1, mapped_fst2, phi_label, &composed_lat); @@ -102,9 +115,16 @@ int main(int argc, char *argv[]) { KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; n_fail++; } else { - CompactLattice clat; - ConvertLattice(composed_lat, &clat); - compact_lattice_writer.Write(key, clat); + if (write_compact) { + CompactLattice clat; + ConvertLattice(composed_lat, &clat); + compact_lattice_writer.Write(key, clat); + } else { + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &composed_lat); + lattice_writer.Write(key, composed_lat); + } n_done++; } } @@ -137,6 +157,11 @@ int main(int argc, char *argv[]) { fst::ILabelCompare ilabel_comp; fst::ArcSort(&lat2, ilabel_comp); } + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + if (!write_compact) + 
ComputeAcousticScoresMap(lat1, &acoustic_scores); Lattice lat_out; if (phi_label > 0) { @@ -149,9 +174,16 @@ int main(int argc, char *argv[]) { KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; n_fail++; } else { - CompactLattice clat_out; - ConvertLattice(lat_out, &clat_out); - compact_lattice_writer.Write(key, clat_out); + if (write_compact) { + CompactLattice clat_out; + ConvertLattice(lat_out, &clat_out); + compact_lattice_writer.Write(key, clat_out); + } else { + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &lat_out); + lattice_writer.Write(key, lat_out); + } n_done++; } } diff --git a/src/latbin/lattice-interp.cc b/src/latbin/lattice-interp.cc index ebb6bd40198..dcd851e5b73 100644 --- a/src/latbin/lattice-interp.cc +++ b/src/latbin/lattice-interp.cc @@ -117,14 +117,13 @@ int main(int argc, char *argv[]) { n_empty++; } else { n_success++; - // Replace each arc (t, tid) with the averaged acoustic score from - // the computed map - if (write_compact) { CompactLattice clat3; ConvertLattice(lat3, &clat3); compact_lattice_writer.Write(key, clat3); } else { + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map ReplaceAcousticScoresFromMap(acoustic_scores, &lat3); lattice_writer.Write(key, lat3); } From fc472c3ff9266f268aae5644a2eecfe63391b84c Mon Sep 17 00:00:00 2001 From: System User Date: Fri, 17 Nov 2017 16:26:56 -0500 Subject: [PATCH 097/174] semisup: Adding more recipes --- .../run_tdnn_100k_semisupervised_conf_c.sh | 10 +- .../run_tdnn_100k_semisupervised_conf_d.sh | 8 +- .../run_tdnn_100k_semisupervised_conf_e.sh | 463 +++++++++++++++++ .../run_tdnn_100k_semisupervised_conf_f.sh | 463 +++++++++++++++++ .../run_tdnn_100k_semisupervised_conf_g.sh | 461 +++++++++++++++++ .../run_tdnn_100k_semisupervised_conf_h.sh | 463 +++++++++++++++++ .../run_tdnn_11k_semisupervised_conf_n.sh | 12 +- .../semisup/chain/tuning/run_tdnn_15k_c.sh | 3 +- .../semisup/chain/tuning/run_tdnn_15k_d.sh | 5 +- .../semisup/chain/tuning/run_tdnn_15k_e.sh | 1 + .../semisup/chain/tuning/run_tdnn_15k_f.sh | 1 + .../semisup/chain/tuning/run_tdnn_15k_g.sh | 178 +++++++ .../semisup/chain/tuning/run_tdnn_15k_h.sh | 178 +++++++ .../semisup/chain/tuning/run_tdnn_15k_i.sh | 197 ++++++++ .../semisup/chain/tuning/run_tdnn_15k_j.sh | 197 ++++++++ .../run_tdnn_15k_semisupervised_conf_aa.sh | 14 +- .../run_tdnn_15k_semisupervised_conf_ab.sh | 11 +- .../run_tdnn_15k_semisupervised_conf_ac.sh | 452 +++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_ad.sh | 452 +++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_ae.sh | 449 +++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_af.sh | 449 +++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_ag.sh | 469 ++++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_ah.sh | 451 +++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_ai.sh | 449 +++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_aj.sh | 460 +++++++++++++++++ .../{run_tdnn_100k.sh => run_tdnn_50k_c.sh} | 32 +- .../run_tdnn_50k_semisupervised_conf_h.sh | 9 +- .../run_tdnn_50k_semisupervised_conf_i.sh | 3 +- .../run_tdnn_50k_semisupervised_conf_j.sh | 7 +- .../run_tdnn_50k_semisupervised_conf_k.sh | 453 +++++++++++++++++ .../s5/local/semisup/run_100k.sh | 2 +- .../s5/local/semisup/run_15k_unk.sh | 74 +++ .../s5/local/semisup/run_50k.sh | 4 +- .../steps/nnet3/multilingual/combine_egs.sh | 2 + 34 files changed, 6832 insertions(+), 50 deletions(-) create mode 
100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_e.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_f.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_g.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_h.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_g.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_h.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_i.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_j.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ac.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ad.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ae.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_af.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ag.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ah.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ai.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aj.sh rename egs/fisher_english/s5/local/semisup/chain/tuning/{run_tdnn_100k.sh => run_tdnn_50k_c.sh} (91%) create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_k.sh create mode 100644 egs/fisher_english/s5/local/semisup/run_15k_unk.sh diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_c.sh index 7790f004b64..a7121dcb8dd 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_c.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_c.sh @@ -7,6 +7,7 @@ # Unsupervised weight: 1.0 # Weights for phone LM (supervised, unsupervises): 3,2 # LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices set -u -e -o pipefail @@ -36,7 +37,7 @@ graph_affix=_sup100k # can be used to decode the unsup data with another lm/gr phone_insertion_penalty= # Semi-supervised options -comb_affix=comb1c # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +comb_affix=comb1c2 # affix for new chain-model directory trained on the combined supervised+unsupervised subsets supervision_weights=1.0,1.0 lm_weights=3,2 sup_egs_dir= @@ -139,14 +140,13 @@ for dset in $unsupervised_set; do steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ --acwt 1.0 --post-decode-acwt 10.0 --write-compact false --skip-scoring true \ --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_sp_hires \ - --scoring-opts "--min-lmwt 10 --max-lmwt 10" --determinize-opts "--word-determinize=false" \ + --scoring-opts "--min-lmwt 10 --max-lmwt 10" --determinize-opts "--word-determinize=false" \ $graphdir data/${dset}_sp_hires $chaindir/decode_${dset}_sp${decode_affix} fi if [ $stage -le 6 ]; then - 
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ - --write-compact false --read-determinized false --write-determinized false \ - --skip-scoring true \ + steps/lmrescore_const_arpa_undeterminized.sh --cmd "$decode_cmd" \ + --write-compact false --acwt 0.1 --beam 8.0 --skip-scoring true \ data/lang_test${graph_affix} \ data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ $chaindir/decode_${dset}_sp${decode_affix} \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_d.sh index 4e2df5e5d16..31ac754ffef 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_d.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_d.sh @@ -6,6 +6,7 @@ # Unsupervised weight: 1.0 # Weights for phone LM (supervised, unsupervises): 3,2 # LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices set -u -e -o pipefail @@ -35,7 +36,7 @@ graph_affix=_sup100k # can be used to decode the unsup data with another lm/gr phone_insertion_penalty= # Semi-supervised options -comb_affix=comb1d # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +comb_affix=comb1d2 # affix for new chain-model directory trained on the combined supervised+unsupervised subsets supervision_weights=1.0,1.0 lm_weights=3,2 sup_egs_dir= @@ -143,9 +144,8 @@ for dset in $unsupervised_set; do fi if [ $stage -le 6 ]; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ - --write-compact false --read-determinized false --write-determinized false \ - --skip-scoring true \ + steps/lmrescore_const_arpa_undeterminized.sh --cmd "$decode_cmd" \ + --write-compact false --acwt 0.1 --beam 8.0 --skip-scoring true \ data/lang_test${graph_affix} \ data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ $chaindir/decode_${dset}_sp${decode_affix} \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_e.sh new file mode 100644 index 00000000000..d98a0e599be --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_e.sh @@ -0,0 +1,463 @@ +#!/bin/bash + +# This script is similar to _d, but uses 3gram LM with best path from 4gram LM. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 3gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7c # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1e # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_a +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}_fg/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix}_fg \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
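+    # In outline, the options passed to get_egs_split.sh below are what turn the
+    # decoded lattices into chain supervision for the unsupervised data:
+    # --lattice-prune-beam prunes the raw decode lattices, --lattice-lm-scale
+    # keeps the (scaled) LM costs from the lattice as weights on the numerator
+    # FST rather than collapsing to a single best path, and the left/right
+    # tolerances allow phone boundaries to shift by that many frames when the
+    # lattices are split into fixed-length chunks.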
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}_fg/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_f.sh new 
file mode 100644 index 00000000000..11432a15b5c --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_f.sh @@ -0,0 +1,463 @@ +#!/bin/bash + +# This is semi-supervised training with 500 hours of unsupervised data. +# This script is similar to _d, but with 500 hours sunsupervised data. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7c # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1f # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_a +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_g.sh new 
file mode 100644 index 00000000000..f6b7027eeec --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_g.sh @@ -0,0 +1,461 @@ +#!/bin/bash + +# This is semi-supervised training with 500 hours of unsupervised data. +# This script is similar to _f, but uses 3gram LM. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 3gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7c # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1g # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_a +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
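+    # A rough note on the per-frame weighting used here: weights.scp under
+    # $chaindir/best_path_${unsupervised_set}${decode_affix} holds, for each
+    # frame of the unsupervised data, the lattice posterior of the best-path pdf;
+    # it is attached to the egs via --deriv-weights-scp, and with
+    # --chain.apply-deriv-weights true the training derivatives for
+    # low-confidence frames are scaled down accordingly.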
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_h.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_h.sh new 
file mode 100644 index 00000000000..ad83fbda0b7 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_h.sh @@ -0,0 +1,463 @@ +#!/bin/bash + +# This is semi-supervised training with 500 hours of unsupervised data. +# This script is similar to _e, but with 500 hours of unsupervised data. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 0.3 +# Weights for phone LM (supervised, unsupervised): 10,1 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7c # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1h # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=10,1 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_a +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if !
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
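+    # A rough sketch of what the next call does (standard chain-egs behaviour,
+    # not anything specific to this script): the decoded lattices in
+    # $unsup_lat_dir are converted into supervision FSTs.  --lattice-lm-scale 0.5
+    # keeps the lattice LM scores, scaled by 0.5, as weights on the competing
+    # paths; --lattice-prune-beam 4.0 prunes the lattices first; the tolerance
+    # options let each phone boundary shift by up to one frame; and
+    # --deriv-weights-scp supplies per-frame weights (best-path posteriors) that
+    # later scale the derivatives.  --alignment-subsampling-factor is 1 here
+    # (rather than 3 as for the supervised egs), presumably because these
+    # lattices come from decoding with the chain model itself, so they are
+    # already at the subsampled output frame rate.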
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh index 
dab7eb692d8..4d92f6df1e0 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_n.sh @@ -50,7 +50,7 @@ extra_right_context=0 xent_regularize=0.1 hidden_dim=725 -minibatch_size=128 +minibatch_size="150=128/300=64" # to tune: # frames_per_eg for unsupervised @@ -88,7 +88,7 @@ fi if false && [ $stage -le 1 ]; then echo "$0: chain training on the supervised subset data/${supervised_set}" - local/chain/run_tdnn_11k.sh $train_supervised_opts --remove-egs false \ + local/chain/run_tdnn_15k.sh $train_supervised_opts --remove-egs false \ --train-set $supervised_set --ivector-train-set $base_train_set \ --nnet3-affix $nnet3_affix --tdnn-affix $tdnn_affix --exp $exp fi @@ -164,6 +164,12 @@ fi dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} if [ $stage -le 10 ]; then + steps/subset_ali_dir.sh --cmd "$train_cmd" \ + data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ + $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} + echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ $dir @@ -323,7 +329,7 @@ if [ $stage -le 15 ]; then --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 4 \ --trainer.optimization.num-jobs-initial 3 \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_c.sh index 3d09d9ee4ab..43f3505d545 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_c.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_c.sh @@ -1,7 +1,8 @@ #!/bin/bash set -e -# This is fisher chain recipe for training a model on a subset of around 10 hours. +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is based on run_tdnn_11k.sh, but uses a chunk width of 160,140,110,80. # configs for 'chain' stage=0 diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_d.sh index 16d9f12acf4..1541d8c8e02 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_d.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_d.sh @@ -1,7 +1,8 @@ #!/bin/bash set -e -# This is fisher chain recipe for training a model on a subset of around 10 hours. +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is similar to _c, but uses a biphone tree with up to 7000 leaves. 
# configs for 'chain' stage=0 @@ -11,7 +12,7 @@ get_egs_stage=-10 decode_iter= train_set=train_sup15k ivector_train_set=semisup15k_250k -tree_affix= +tree_affix=bi_d nnet3_affix=_semi15k_250k chain_affix=_semi15k_250k exp=exp/semisup_15k diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_e.sh index 4f077e74410..91d938e5f42 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_e.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_e.sh @@ -2,6 +2,7 @@ set -e # This is fisher chain recipe for training a model on a subset of around 10 hours. +# This is similar to _d, but uses a biphone tree with up to 2000 leaves. # configs for 'chain' stage=0 diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_f.sh index 4dd2a38fd13..906d8eeca98 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_f.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_f.sh @@ -2,6 +2,7 @@ set -e # This is fisher chain recipe for training a model on a subset of around 10 hours. +# This is similar to _d, but uses a biphone tree with up to 4000 leaves. # configs for 'chain' stage=0 diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_g.sh new file mode 100644 index 00000000000..47160845f30 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_g.sh @@ -0,0 +1,178 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is similar to _c, but uses a biphone tree with up to 7000 leaves. + +# configs for 'chain' +stage=0 +tdnn_affix=7g +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +unsupervised_set=train_unsup250k_240k +semisup_set=semisup15k_250k +tree_affix=bi_g +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +orig_treedir=exp/semisup_15k/chain_semi15k_250k/tree_bi_d +unsup_alidir=exp/semisup_15k/chain_semi15k_250k/tdnn7d_sp/best_path_train_unsup250k_240k_unphdet_ex250k_fg + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
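+  # Rough sketch of the learning-rate schedule below, assuming the usual
+  # train.py convention that the "effective" rate is the per-job rate divided
+  # by the number of parallel jobs:
+  #   per-job lrate at the start  ~= 0.001  * 3  = 0.003   (3 jobs)
+  #   per-job lrate at the end    ~= 0.0001 * 16 = 0.0016  (16 jobs)
+  # with an exponential decay between the two over the $num_epochs epochs.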
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_h.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_h.sh new file mode 100644 index 00000000000..057513707f8 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_h.sh @@ -0,0 +1,178 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is similar to _c, but uses a biphone tree with up to 7000 leaves. + +# configs for 'chain' +stage=0 +tdnn_affix=7h +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +unsupervised_set=train_unsup250k_240k +semisup_set=semisup15k_250k +tree_affix=bi_h +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +sup_alidir=exp/semisup_15k/chain_semi15k_250k/tri3_train_sup15k_sp_ali +unsup_alidir=exp/semisup_15k/chain_semi15k_250k/tdnn7d_sp/best_path_train_unsup250k_240k_sp_unphdet_ex250k_fg + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
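+  # Note on --egs.chunk-width 160,140,110,80 below: egs are cut into chunks of
+  # any of these lengths (160 being the primary one), which should waste fewer
+  # frames on utterances that do not divide evenly into 160-frame pieces;
+  # minibatches group chunks of equal length, so a single
+  # --trainer.num-chunk-per-minibatch value ($minibatch_size = 128) still applies.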
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_i.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_i.sh new file mode 100755 index 00000000000..4ab3fa480d5 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_i.sh @@ -0,0 +1,197 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is similar to _d, but uses a phone LM UNK model + +# configs for 'chain' +stage=0 +tdnn_affix=7i +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +ivector_train_set=semisup15k_250k +tree_affix=bi_i +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
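+  # The options below should give a left-biphone tree: --context-width=2 with
+  # --central-position=1 means each question sees the current phone plus one
+  # phone of left context, and 7000 is the target number of leaves, as in the
+  # other _15k tree-building scripts.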
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_j.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_j.sh new file mode 100755 index 00000000000..0c39841d7ef --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_j.sh @@ -0,0 +1,197 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is similar to _d, but uses a speed-perturbed data for tree building + +# configs for 'chain' +stage=0 +tdnn_affix=7j +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +ivector_train_set=semisup15k_250k +tree_affix=bi_j +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=10 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
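+  # Note: --chain.apply-deriv-weights is false here, as in the other purely
+  # supervised _15k recipes, so all frames are weighted equally; the
+  # semi-supervised recipes in this directory set it to true and weight frames
+  # by the best-path lattice posteriors instead.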
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aa.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aa.sh index 7e7a734e4fb..c02005540f5 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aa.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aa.sh @@ -1,11 +1,13 @@ #!/bin/bash # This script is same as _z, but uses 7d as seed model and bi_d tree. 
-# unsup_frames_per_eg=150 +# sup_frames_per_eg=160,140,110,80 +# unsup_frames_per_eg=160,140,110,80 # Deriv weights: Lattice posterior of best path pdf # Unsupervised weight: 1.0 # Weights for phone LM (supervised, unsupervises): 5,2 # LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices set -u -e -o pipefail @@ -26,7 +28,9 @@ train_supervised_opts="--stage -10 --train-stage -10" # Unsupervised options decode_affix=_unphdet egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir -unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +unsup_frames_per_eg= # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg= lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data tolerance=1 @@ -128,8 +132,8 @@ for dset in $unsupervised_set; do fi if [ $stage -le 6 ]; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" --write-compact false \ - --read-determinized false --write-determinized false --acwt 0.1 --beam 8.0 \ + steps/lmrescore_const_arpa_undeterminized.sh --cmd "$decode_cmd" \ + --write-compact false --acwt 0.1 --beam 8.0 \ data/lang_test${graph_affix} \ data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ $chaindir/decode_${dset}_sp${decode_affix} \ @@ -330,7 +334,7 @@ if [ $stage -le 15 ]; then --chain.apply-deriv-weights $apply_deriv_weights \ --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ + --egs.chunk-width $frames_per_eg \ --trainer.num-chunk-per-minibatch "$minibatch_size" \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 4 \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ab.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ab.sh index f3626571de9..50ed5f5ee6b 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ab.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ab.sh @@ -1,11 +1,13 @@ #!/bin/bash # This script is same as _z, but does rescoring correctly. 
+# sup_frames_per_eg=150 # unsup_frames_per_eg=150 # Deriv weights: Lattice posterior of best path pdf # Unsupervised weight: 1.0 # Weights for phone LM (supervised, unsupervises): 5,2 # LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices set -u -e -o pipefail @@ -20,13 +22,15 @@ unsupervised_set=train_unsup250k # set this to your choice of unsupervised data supervised_set=train_sup15k semi_affix=semi15k_250k # affix relating train-set splitting proportion -tdnn_affix=7d # affix for the supervised chain-model directory +tdnn_affix=7b # affix for the supervised chain-model directory train_supervised_opts="--stage -10 --train-stage -10" # Unsupervised options decode_affix=_unphdet egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir -unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +unsup_frames_per_eg= # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg= lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data tolerance=1 @@ -244,6 +248,7 @@ sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats if [ -z "$sup_egs_dir" ]; then sup_egs_dir=$dir/egs_${supervised_set} frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg if [ $stage -le 12 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then @@ -384,7 +389,7 @@ fi if [ $stage -le 19 ]; then mkdir -p ${dir}${finetune_suffix} - + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 done diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ac.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ac.sh new file mode 100644 index 00000000000..973bbf93c6a --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ac.sh @@ -0,0 +1,452 @@ +#!/bin/bash + +# This script is same as _ab, but uses frames_per_eg 150. This is same as _z with rescoring done correctly. 
+# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ac # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +remove_egs=true + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + ! $remove_egs && touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + ! $remove_egs && touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
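+    # The unsupervised egs generated below are pooled with the supervised egs
+    # in stage 14 by combine_egs.sh, where --lang2weight $supervision_weights
+    # (1.0,1.0 here) in effect scales the gradients contributed by each source;
+    # the denominator phone LM was already built in stage 10 with
+    # --num-repeats $lm_weights (5,2), i.e. the supervised alignments are
+    # counted five times and the unsupervised best-path alignments twice.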
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ad.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ad.sh 
new file mode 100644 index 00000000000..9b0c9e8560c --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ad.sh @@ -0,0 +1,452 @@ +#!/bin/bash + +# This script is same as _ac, but uses naive splitting. +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ad # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +remove_egs=true + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + ! $remove_egs && touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + ! $remove_egs && touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
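+
+    # Note: the supervised egs above use --alignment-subsampling-factor 3
+    # because the tri3 lattices are at the original (10ms) frame rate, whereas
+    # the unsupervised lattices below come from decoding with the chain model
+    # itself and are already at the subsampled frame rate, hence
+    # --alignment-subsampling-factor 1.  This variant also uses the plain
+    # get_egs.sh, i.e. the "naive" splitting of the lattice supervision
+    # mentioned in the header, rather than get_egs_split.sh.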
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ae.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ae.sh 
new file mode 100644 index 00000000000..68ac5bc51f9 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ae.sh @@ -0,0 +1,449 @@ +#!/bin/bash + +# This script is same as _aa, but uses frames-per-eg of 150 +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ae # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
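+
+    # --generate-egs-scp true is needed for both the supervised and the
+    # unsupervised egs so that stage 14 below can combine them with
+    # steps/nnet3/multilingual/combine_egs.sh, which reads the egs via their
+    # .scp files and scales the two sources with --lang2weight
+    # ($supervision_weights, here 1.0,1.0); the positional "2" in that call is
+    # the number of egs directories being combined.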
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_af.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_af.sh 
new file mode 100644 index 00000000000..9e6814c3328 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_af.sh @@ -0,0 +1,449 @@ +#!/bin/bash + +# This script is same as _ac, but uses naive splitting. +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1af # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
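+
+    # The --left-context/--right-context values passed below were computed
+    # near the top of this section as
+    # int(model_context + extra_context + frame_subsampling_factor / 2).
+    # For example, with a hypothetical model_left_context=16,
+    # extra_left_context=0 and frame_subsampling_factor=3:
+    #   perl -e "print int(16 + 3 / 2)"    # prints 17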
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ag.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ag.sh 
new file mode 100644 index 00000000000..2d86b39564d --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ag.sh @@ -0,0 +1,469 @@ +#!/bin/bash + +# This script is same as _z, but does rescoring correctly. +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg= +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ag # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
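+
+    # In this variant sup_frames_per_eg and unsup_frames_per_eg are left empty
+    # at the top of the script, so frames_per_eg is taken from the base
+    # model's egs ($chaindir/egs/info/frames_per_eg) and, via the defaulting a
+    # few lines above, the unsupervised egs below reuse the same value; as the
+    # header comments note, minibatch_size may need adjusting if that value
+    # differs from 150.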
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ah.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ah.sh 
new file mode 100644
index 00000000000..9e2a3bb088d
--- /dev/null
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ah.sh
@@ -0,0 +1,451 @@
+#!/bin/bash
+
+# This script is the same as _ae, but uses naive splitting.
+# sup_frames_per_eg=150
+# unsup_frames_per_eg=150
+# Deriv weights: Lattice posterior of best path pdf
+# Unsupervised weight: 1.0
+# Weights for phone LM (supervised, unsupervised): 5,2
+# LM for decoding unsupervised data: 4gram
+# Supervision: Naive split lattices
+
+set -u -e -o pipefail
+
+stage=-2
+train_stage=-100
+nj=40
+decode_nj=40
+base_train_set=semisup15k_250k # for reference
+exp=exp/semisup_15k
+
+unsupervised_set=train_unsup250k # set this to your choice of unsupervised data
+supervised_set=train_sup15k
+semi_affix=semi15k_250k # affix relating to the train-set splitting proportion
+
+tdnn_affix=7d # affix for the supervised chain-model directory
+train_supervised_opts="--stage -10 --train-stage -10"
+
+# Unsupervised options
+decode_affix=_unphdet
+egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir
+unsup_frames_per_eg=150 # if empty, will be equal to the supervised model's config unless sup_frames_per_eg is set
+                        # -- you will need to change minibatch_size for comb training accordingly
+sup_frames_per_eg=150
+lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices
+lattice_prune_beam=4.0 # If supplied, the lattices will be pruned prior to getting egs for unsupervised data
+tolerance=1
+graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph
+phone_insertion_penalty=
+
+# Semi-supervised options
+comb_affix=comb1ah # affix for the new chain-model directory trained on the combined supervised+unsupervised subsets
+supervision_weights=1.0,1.0
+lm_weights=5,2
+sup_egs_dir=
+unsup_egs_dir=
+tree_affix=bi_d
+unsup_egs_opts=
+apply_deriv_weights=true
+
+do_finetuning=false
+
+extra_left_context=0
+extra_right_context=0
+
+xent_regularize=0.1
+hidden_dim=725
+minibatch_size="150=128/300=64"
+# to tune:
+# frames_per_eg for unsupervised
+
+decode_iter=
+lang_test_suffix=
+
+finetune_stage=-2
+finetune_suffix=_finetune
+finetune_iter=final
+num_epochs_finetune=1
+finetune_xent_regularize=0.1
+finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05"
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs
+decode_affix=${decode_affix}${graph_affix}
+egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}
+
+RANDOM=0
+
+if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
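+
+    # Note (illustrative sketch, not prescribed by this recipe): the get_egs.sh
+    # call below builds 'chain' supervision from the decoded lattices of the
+    # unsupervised data: --lattice-prune-beam prunes the lattices first,
+    # --lattice-lm-scale keeps a scaled copy of the lattice graph/LM weights in
+    # the supervision, and --deriv-weights-scp supplies per-frame derivative
+    # weights (here, the best-path pdf posteriors computed earlier).  Assuming
+    # path.sh has put the Kaldi binaries on the PATH, those weights could be
+    # eyeballed with something like:
+    #   copy-vector scp:$chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp ark,t:- | head -n 1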
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ai.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ai.sh 
new file mode 100644 index 00000000000..10e92661b16 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ai.sh @@ -0,0 +1,449 @@ +#!/bin/bash + +# This script is same as _ae, but uses larger 7h tree +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7h # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ai # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_h +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aj.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aj.sh 
new file mode 100644 index 00000000000..8c13d94e6bb --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aj.sh @@ -0,0 +1,460 @@ +#!/bin/bash + +# This script is same as _ae, but uses 7i model and tree with UNK decoding +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7i # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_poco_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1aj # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_i +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_c.sh 
similarity index 91% rename from egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k.sh rename to egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_c.sh index 904ed588bd3..f84d4715f7c 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_c.sh @@ -1,26 +1,26 @@ #!/bin/bash set -e -# This is fisher chain recipe for training a model on a subset of around 10 hours. +# This is fisher chain recipe for training a model on a subset of around 50 hours. # configs for 'chain' stage=0 -tdnn_affix=7b +tdnn_affix=7c train_stage=-10 get_egs_stage=-10 decode_iter= -train_set=train_sup -ivector_train_set=train_sup +train_set=train_sup50k +ivector_train_set=semisup50k_250k tree_affix= -nnet3_affix= -chain_affix= -exp=exp/semisup_100k +nnet3_affix=_semi50k_250k +chain_affix=_semi50k_250k +exp=exp/semisup_50k gmm=tri4a xent_regularize=0.1 -hidden_dim=725 +hidden_dim=500 # training options -num_epochs=4 +num_epochs=8 remove_egs=false common_egs_dir= minibatch_size=128 @@ -83,7 +83,8 @@ if [ $stage -le 11 ]; then # Build a tree using our new topology. steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 data/${train_set} $lang $gmm_dir $treedir || exit 1 + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set} $lang $gmm_dir $treedir || exit 1 fi if [ $stage -le 12 ]; then @@ -107,10 +108,11 @@ if [ $stage -le 12 ]; then relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim - relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 # adding the layers for xent branch @@ -122,7 +124,7 @@ if [ $stage -le 12 ]; then # final-layer learns at a rate independent of the regularization # constant; and the 0.5 was tuned so as to make the relative progress # similar in the xent and regular final layers. - relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 EOF @@ -135,6 +137,7 @@ if [ $stage -le 13 ]; then /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi + mkdir -p $dir/egs touch $dir/egs/.nodelete # keep egs around when that run dies. 
steps/nnet3/chain/train.py --stage $train_stage \ @@ -149,7 +152,7 @@ if [ $stage -le 13 ]; then --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ - --egs.chunk-width 150 \ + --egs.chunk-width 160,140,110,80 \ --trainer.num-chunk-per-minibatch $minibatch_size \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs $num_epochs \ @@ -191,4 +194,3 @@ if [ $stage -le 15 ]; then fi wait; exit 0; - diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_h.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_h.sh index c1fb23eb970..4aa1105c7c6 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_h.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_h.sh @@ -1,11 +1,12 @@ #!/bin/bash -# This script is same as _g, but split lattice supervision +# This script is same as _g, but smart splitting. # unsup_frames_per_eg=150 # Deriv weights: Lattice posterior of best path pdf # Unsupervised weight: 1.0 # Weights for phone LM (supervised, unsupervises): 3,2 # LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices set -u -e -o pipefail @@ -34,7 +35,7 @@ graph_affix=_ex250k # can be used to decode the unsup data with another lm/gra phone_insertion_penalty= # Semi-supervised options -comb_affix=comb1h # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +comb_affix=comb1h2 # affix for new chain-model directory trained on the combined supervised+unsupervised subsets supervision_weights=1.0,1.0 lm_weights=3,2 sup_egs_dir= @@ -131,8 +132,8 @@ for dset in $unsupervised_set; do fi if [ $stage -le 5 ]; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" --write-compact false \ - --read-determinized false --write-determinized false --acwt 0.1 --beam 8.0 \ + steps/lmrescore_const_arpa_undeterminized.sh --cmd "$decode_cmd" \ + --write-compact false --acwt 0.1 --beam 8.0 --skip-scoring true \ data/lang_test${graph_affix} \ data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ $chaindir/decode_${dset}_sp${decode_affix} \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_i.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_i.sh index a114d09463a..27fe0476727 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_i.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_i.sh @@ -6,6 +6,7 @@ # Unsupervised weight: 1.0 # Weights for phone LM (supervised, unsupervises): 3,2 # LM for decoding unsupervised data: 3gram +# Supervision: Smart split lattices set -u -e -o pipefail @@ -34,7 +35,7 @@ graph_affix=_ex250k # can be used to decode the unsup data with another lm/gra phone_insertion_penalty= # Semi-supervised options -comb_affix=comb1h # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +comb_affix=comb1i # affix for new chain-model directory trained on the combined supervised+unsupervised subsets supervision_weights=1.0,1.0 lm_weights=3,2 sup_egs_dir= diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_j.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_j.sh index 123464c34ab..bfca03d9de9 100755 --- 
a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_j.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_j.sh @@ -1,11 +1,12 @@ #!/bin/bash -# This script is same as _h, but uses 3-gram LM. +# This script is same as _i, but uses best path and weights from 4gram. # unsup_frames_per_eg=150 # Deriv weights: Lattice posterior of best path pdf # Unsupervised weight: 1.0 # Weights for phone LM (supervised, unsupervises): 3,2 -# LM for decoding unsupervised data: 4gram +# LM for decoding unsupervised data: 3gram +# Supervision: Smart split lattices set -u -e -o pipefail @@ -132,7 +133,7 @@ for dset in $unsupervised_set; do if [ $stage -le 5 ]; then steps/lmrescore_const_arpa_undeterminized.sh --cmd "$decode_cmd" \ - --write-compact false --acwt 0.1 --beam 8.0 \ + --write-compact false --acwt 0.1 --beam 8.0 --skip-scoring true \ data/lang_test${graph_affix} \ data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ $chaindir/decode_${dset}_sp${decode_affix} \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_k.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_k.sh new file mode 100755 index 00000000000..1c78277cb7f --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_k.sh @@ -0,0 +1,453 @@ +#!/bin/bash + +# This script is same as _j, but uses 4gram LM. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1k # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix= +unsup_egs_opts= +apply_deriv_weights=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. 
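+
+# Note (illustrative, assuming the defaults above are not overridden on the
+# command line): the derived affixes computed after option parsing below
+# expand roughly as
+#   decode_affix=${decode_affix}${graph_affix}    # -> "_undet_ex250k"
+#   egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}
+#                                                 # -> "_prun4.0_lmwt0.5_tol1"
+# so, for example, the unsupervised egs would end up under
+#   $dir/egs_train_unsup250k_sp_undet_ex250k_prun4.0_lmwt0.5_tol1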
+echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix= + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${lang_test_suffix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/run_100k.sh b/egs/fisher_english/s5/local/semisup/run_100k.sh index b7a6f7c0f61..05e6622c582 100644 --- 
a/egs/fisher_english/s5/local/semisup/run_100k.sh +++ b/egs/fisher_english/s5/local/semisup/run_100k.sh @@ -70,7 +70,7 @@ steps/train_sat.sh --cmd "$train_cmd" \ utils/copy_data_dir.sh data/train_unsup250k data/train_unsup100k_250k utils/combine_data.sh data/semisup100k_250k data/train_sup \ - data/train_unsup250k || exit 1 + data/train_unsup100k_250k || exit 1 } local/semisup/chain/tuning/run_tdnn_100k.sh \ diff --git a/egs/fisher_english/s5/local/semisup/run_15k_unk.sh b/egs/fisher_english/s5/local/semisup/run_15k_unk.sh new file mode 100644 index 00000000000..41590dd9fe2 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/run_15k_unk.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +. cmd.sh +. path.sh + +stage=-1 +train_stage=-10 + +. utils/parse_options.sh + +set -o pipefail +exp=exp/semisup_15k + +false && { +utils/subset_data_dir.sh --speakers data/train_sup 15000 data/train_sup15k || exit 1 +utils/subset_data_dir.sh --shortest data/train_sup15k 5000 data/train_sup15k_short || exit 1 +utils/subset_data_dir.sh data/train_sup15k 7500 data/train_sup15k_half || exit 1 + +steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup15k_short data/lang $exp/mono0a || exit 1 + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup15k_half data/lang $exp/mono0a $exp/mono0a_ali || exit 1 + +steps/train_deltas.sh --cmd "$train_cmd" \ + 2000 10000 data/train_sup15k_half data/lang $exp/mono0a_ali $exp/tri1 || exit 1 + +(utils/mkgraph.sh data/lang_test $exp/tri1 $exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri1/graph data/dev $exp/tri1/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup15k data/lang $exp/tri1 $exp/tri1_ali || exit 1; + +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup15k data/lang $exp/tri1_ali $exp/tri2 || exit 1; + +(utils/mkgraph.sh data/lang_test $exp/tri2 $exp/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri2/graph data/dev $exp/tri2/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup15k data/lang $exp/tri2 $exp/tri2_ali || exit 1; + +steps/train_sat.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup15k data/lang $exp/tri2_ali $exp/tri3 || exit 1; + +( + utils/mkgraph.sh data/lang_test $exp/tri3 $exp/tri3/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri3/graph data/dev $exp/tri3/decode_dev +)& + +utils/combine_data.sh data/semisup15k_250k data/train_sup15k data/train_unsup250k || exit 1 + +local/semisup/chain/tuning/run_tdnn_11k.sh \ + --train-set train_sup15k \ + --nnet3-affix _semi15k_250k \ + --chain-affix _semi15k_250k \ + --stage $stage --train-stage $train_stage \ + --exp $exp \ + --ivector-train-set semisup15k_250k || exit 1 +} + +false && local/semisup/chain/tuning/run_tdnn_oracle.sh \ + --train-set semisup15k_250k \ + --nnet3-affix _semi15k_250k \ + --chain-affix _semi15k_250k_oracle \ + --stage 9 --train-stage $train_stage \ + --exp $exp \ + --ivector-train-set semisup15k_250k || exit 1 diff --git a/egs/fisher_english/s5/local/semisup/run_50k.sh b/egs/fisher_english/s5/local/semisup/run_50k.sh index eeb6f3fd3c4..e69997cba2c 100644 --- a/egs/fisher_english/s5/local/semisup/run_50k.sh +++ b/egs/fisher_english/s5/local/semisup/run_50k.sh @@ -14,7 +14,6 @@ train_stage=-10 set -o pipefail exp=exp/semisup_50k -false && { utils/subset_data_dir.sh --speakers data/train_sup 50000 
data/train_sup50k || exit 1 utils/subset_data_dir.sh --shortest data/train_sup50k 25000 data/train_sup50k_short || exit 1 utils/subset_data_dir.sh --speakers data/train_sup50k 30000 data/train_sup50k_30k || exit 1; @@ -42,7 +41,7 @@ steps/train_deltas.sh --cmd "$train_cmd" \ steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ $exp/tri2/graph data/dev $exp/tri2/decode_dev)& - steps/align_si.sh --nj 30 --cmd "$train_cmd" \ +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ data/train_sup50k data/lang $exp/tri2 $exp/tri2_ali || exit 1; steps/train_lda_mllt.sh --cmd "$train_cmd" \ @@ -74,7 +73,6 @@ local/semisup/chain/tuning/run_tdnn_50k.sh \ --stage $stage --train-stage $train_stage \ --exp $exp \ --ivector-train-set semisup50k_250k || exit 1 -} local/semisup/chain/tuning/run_tdnn_oracle.sh \ --train-set semisup50k_250k \ diff --git a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh index f899a50e58f..75a49e1004e 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh +++ b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh @@ -89,6 +89,8 @@ for lang in $(seq 0 $[$num_langs-1]);do this_frames_per_eg=$(cat ${args[$lang]}/info/frames_per_eg | \ awk -F, '{for (i=1; i<=NF; i++) sum += $i;} END{print int(sum / NF)}') # use average frames-per-eg + # frames_per_eg_list stores the average frames-per-eg for each language. + # The average does not have to be exact. if [ $lang -eq 0 ]; then frames_per_eg_list="$this_frames_per_eg" else From 010bc4e1494eed60a95b018a91021a783a0b5df4 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 17 Nov 2017 16:28:20 -0500 Subject: [PATCH 098/174] semisup: Unk model on Fisher --- egs/fisher_english/s5/local/run_unk_model.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/egs/fisher_english/s5/local/run_unk_model.sh b/egs/fisher_english/s5/local/run_unk_model.sh index 6b2aca76495..924203c5ff7 100755 --- a/egs/fisher_english/s5/local/run_unk_model.sh +++ b/egs/fisher_english/s5/local/run_unk_model.sh @@ -1,6 +1,8 @@ #!/bin/bash -utils/lang/make_unk_lm.sh data/local/dict exp/unk_lang_model +# Copyright 2017 Vimal Manohar + +utils/lang/make_unk_lm.sh data/local/dict exp/unk_lang_model || exit 1 utils/prepare_lang.sh --unk-fst exp/unk_lang_model/unk_fst.txt \ data/local/dict "" data/local/lang data/lang_unk @@ -10,7 +12,15 @@ utils/prepare_lang.sh --unk-fst exp/unk_lang_model/unk_fst.txt \ # keeps the graph compact after adding the unk model (we only have to add one # copy of it). 
-cp data/lang/G.fst data/lang_unk/G.fst +mkdir -p data/lang_poco_test_unk +cp -r data/lang_unk/* data/lang_poco_test_unk +cp data/lang_poco_test/G.fst data/lang_poco_test_unk/G.fst + +mkdir -p data/lang_poco_test_ex250k_unk +cp -r data/lang_unk/* data/lang_poco_test_ex250k_unk +cp data/lang_poco_test_ex250k/G.fst data/lang_poco_test_ex250k_unk/G.fst + +exit 0 utils/mkgraph.sh data/lang_unk exp/tri3 exp/tri3/graph_unk From d43125bb217f84fa109748184e7df454e77a64b8 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 17 Nov 2017 16:28:52 -0500 Subject: [PATCH 099/174] semisup: Bug fix in ivectors in semi-supervised scenario --- .../semisup/nnet3/run_ivector_common_pca.sh | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common_pca.sh b/egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common_pca.sh index 99648a93f08..aae97e145d5 100755 --- a/egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common_pca.sh +++ b/egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common_pca.sh @@ -2,7 +2,7 @@ . ./cmd.sh set -e -stage=1 +stage=-1 speed_perturb=true train_set=train @@ -26,8 +26,10 @@ if [ -z "$unsup_train_set" ] && [ ! -z "$semisup_train_set" ]; then fi if [ ! -z "$unsup_train_set" ]; then - utils/combine_data.sh data/$semisup_train_set \ - data/$train_set data/$unsup_train_set + if [ $stage -le 0 ]; then + utils/combine_data.sh data/$semisup_train_set \ + data/$train_set data/$unsup_train_set + fi fi # perturbed data preparation @@ -50,9 +52,11 @@ if [ "$speed_perturb" == "true" ]; then fi fi -if [ ! -z "$unsup_train_set" ]; then - utils/combine_data.sh data/${semisup_train_set}_sp \ - data/${train_set}_sp data/${unsup_train_set}_sp +if [ $stage -le 2 ]; then + if [ ! -z "$unsup_train_set" ]; then + utils/combine_data.sh data/${semisup_train_set}_sp \ + data/${train_set}_sp data/${unsup_train_set}_sp + fi fi if [ $stage -le 3 ]; then From 82efedb04c72b50a1ce050ca07dccdb2cbabd96f Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 20 Nov 2017 17:19:47 -0500 Subject: [PATCH 100/174] semisup: Minor fixes to scripts --- .../run_tdnn_100k_semisupervised_conf_e.sh | 2 + .../run_tdnn_100k_semisupervised_conf_f.sh | 6 ++- .../run_tdnn_100k_semisupervised_conf_g.sh | 2 + .../run_tdnn_100k_semisupervised_conf_h.sh | 42 +++++++++++-------- .../run_tdnn_15k_semisupervised_conf_aj.sh | 4 +- egs/wsj/s5/steps/best_path_weights.sh | 22 +++++++--- 6 files changed, 51 insertions(+), 27 deletions(-) diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_e.sh index d98a0e599be..2ffa5320d0f 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_e.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_e.sh @@ -1,6 +1,8 @@ #!/bin/bash # This script is similar to _d, but uses 3gram LM with best path from 4gram LM. 
+ +# Unsupervised set: train_unsup100k_250k # unsup_frames_per_eg=150 # Deriv weights: Lattice posterior of best path pdf # Unsupervised weight: 1.0 diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_f.sh index 11432a15b5c..5caf0bbb00c 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_f.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_f.sh @@ -2,6 +2,8 @@ # This is semi-supervised training with 500 hours of unsupervised data. # This script is similar to _d, but with 500 hours sunsupervised data. + +# Unsupervised set: train_unsup100k_500k # unsup_frames_per_eg=150 # Deriv weights: Lattice posterior of best path pdf # Unsupervised weight: 1.0 @@ -151,9 +153,9 @@ for dset in $unsupervised_set; do data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ $chaindir/decode_${dset}_sp${decode_affix} \ $chaindir/decode_${dset}_sp${decode_affix}_fg - - ln -sf ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}_fg/final.mdl fi + + ln -sf ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}_fg/final.mdl done decode_affix=${decode_affix}_fg diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_g.sh index f6b7027eeec..e62c52fe907 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_g.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_g.sh @@ -2,6 +2,8 @@ # This is semi-supervised training with 500 hours of unsupervised data. # This script is similar to _f, but uses 3gram LM. + +# Unsupervised set: train_unsup100k_500k # unsup_frames_per_eg=150 # Deriv weights: Lattice posterior of best path pdf # Unsupervised weight: 1.0 diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_h.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_h.sh index ad83fbda0b7..99311ab9887 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_h.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_h.sh @@ -1,13 +1,15 @@ #!/bin/bash # This is semi-supervised training with 500 hours of unsupervised data. -# This script is similar e, but with 500 hours sunsupervised data. 
+# This script is similar to _g, but uses naive split lattices. + +# Unsupervised set: train_unsup100k_500k # unsup_frames_per_eg=150 # Deriv weights: Lattice posterior of best path pdf -# Unsupervised weight: 0.3 -# Weights for phone LM (supervised, unsupervises): 10,1 -# LM for decoding unsupervised data: 4gram -# Supervision: Smart split lattices +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 3gram +# Supervision: Naive split lattices set -u -e -o pipefail @@ -35,11 +37,12 @@ lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting e tolerance=1 graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph phone_insertion_penalty= +rescore_unsup_lattices=false # Semi-supervised options comb_affix=comb1h # affix for new chain-model directory trained on the combined supervised+unsupervised subsets -supervision_weights=1.0,0.3 -lm_weights=10,1 +supervision_weights=1.0,1.0 +lm_weights=3,2 sup_egs_dir= unsup_egs_dir= tree_affix=bi_a @@ -144,19 +147,24 @@ for dset in $unsupervised_set; do $graphdir data/${dset}_sp_hires $chaindir/decode_${dset}_sp${decode_affix} fi - if [ $stage -le 6 ]; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ - --write-compact false --acwt 0.1 --beam 8.0 --skip-scoring true \ - data/lang_test${graph_affix} \ - data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ - $chaindir/decode_${dset}_sp${decode_affix} \ - $chaindir/decode_${dset}_sp${decode_affix}_fg - + if $rescore_unsup_lattices; then + if [ $stage -le 6 ]; then + steps/lmrescore_const_arpa_undeterminized.sh --cmd "$decode_cmd" \ + --write-compact false --acwt 0.1 --beam 8.0 --skip-scoring true \ + data/lang_test${graph_affix} \ + data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ + $chaindir/decode_${dset}_sp${decode_affix} \ + $chaindir/decode_${dset}_sp${decode_affix}_fg + fi ln -sf ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}_fg/final.mdl + else + ln -sf ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}/final.mdl + fi done -decode_affix=${decode_affix}_fg +if $rescore_unsup_lattices; then + decode_affix=${decode_affix}_fg +fi if [ $stage -le 8 ]; then steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ @@ -305,7 +313,7 @@ if [ -z "$unsup_egs_dir" ]; then touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
echo "$0: generating egs from the unsupervised data" - steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ --left-tolerance $tolerance --right-tolerance $tolerance \ --left-context $left_context --right-context $right_context \ --left-context-initial $left_context_initial --right-context-final $right_context_final \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aj.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aj.sh index 8c13d94e6bb..1affa2a4a34 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aj.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aj.sh @@ -133,7 +133,7 @@ for dset in $unsupervised_set; do if [ $stage -le 4 ]; then echo "$0: getting the decoding lattices for the unsupervised subset using the chain model at: $chaindir" steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 --write-compact false \ + --acwt 1.0 --post-decode-acwt 10.0 --write-compact false --skip-scoring true \\ --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ --scoring-opts "--min-lmwt 10 --max-lmwt 10" --determinize-opts "--word-determinize=false" \ $graphdir data/${dset}_sp_hires $chaindir/decode_${dset}_sp${decode_affix} @@ -141,7 +141,7 @@ for dset in $unsupervised_set; do if [ $stage -le 6 ]; then steps/lmrescore_const_arpa_undeterminized.sh --cmd "$decode_cmd" \ - --write-compact false --acwt 0.1 --beam 8.0 \ + --write-compact false --acwt 0.1 --beam 8.0 --skip-scoring true \ ${unsup_decode_lang} ${unsup_rescore_lang} \ data/${dset}_sp_hires \ $chaindir/decode_${dset}_sp${decode_affix} \ diff --git a/egs/wsj/s5/steps/best_path_weights.sh b/egs/wsj/s5/steps/best_path_weights.sh index d39d367191f..73414695e70 100755 --- a/egs/wsj/s5/steps/best_path_weights.sh +++ b/egs/wsj/s5/steps/best_path_weights.sh @@ -41,8 +41,8 @@ acwt=0.1 write_words=false #end configuration section. -help_message="Usage: "$(basename $0)" [options] [:weight] [:weight] [[:weight] ... ] - E.g. "$(basename $0)" data/train_unt.seg data/lang exp/tri1/decode:0.5 exp/tri2/decode:0.25 exp/tri3/decode:0.25 exp/combine +help_message="Usage: "$0" [options] [:weight] [:weight] [[:weight] ... ] + E.g. "$0" data/train_unt.seg data/lang exp/tri1/decode:0.5 exp/tri2/decode:0.25 exp/tri3/decode:0.25 exp/combine Options: --cmd (run.pl|queue.pl...) # specify how to run the sub-processes. 
"; @@ -85,7 +85,11 @@ if [ $stage -lt -1 ]; then "$words_wspecifier" "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 fi -src_dir=`dirname $decode_dir` +if [ -f `dirname $decode_dir`/final.mdl ]; then + src_dir=`dirname $decode_dir` +else + src_dir=$decode_dir +fi cp $src_dir/cmvn_opts $dir/ || exit 1 for f in final.mat splice_opts frame_subsampling_factor; do @@ -116,9 +120,15 @@ fdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print for i in `seq 0 $[num_sys-1]`; do if [ $stage -lt $i ]; then decode_dir=`echo ${decode_dirs[$i]} | cut -d: -f1` + if [ -f `dirname $decode_dir`/final.mdl ]; then + # model one level up from decode dir + this_srcdir=`dirname $decode_dir` + else + this_srcdir=$decode_dir + fi - model=`dirname $decode_dir`/final.mdl # model one level up from decode dir - tree=`dirname $decode_dir`/tree # tree one level up from decode dir + model=$this_srcdir/final.mdl + tree=$this_srcdir/tree for f in $model $decode_dir/lat.1.gz $tree; do [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; @@ -153,7 +163,7 @@ if [ $stage -lt $num_sys ]; then vector-sum $file_list ark:- \| \ vector-scale --scale=$inv_weights_sum ark:- \ ark,scp:$fdir/weights.JOB.ark,$fdir/weights.JOB.scp || exit 1 - + for n in `seq $nj`; do cat $dir/weights.$n.scp done > $dir/weights.scp From e3b7d7208246cff03c3ccc68669cd14de9295d2f Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 28 Nov 2017 18:18:54 -0500 Subject: [PATCH 101/174] semisup-smbr: Re-organizing stuff --- .../run_tdnn_15k_semisupervised_conf_ai.sh | 22 +- .../run_tdnn_15k_semisupervised_conf_aj.sh | 19 +- .../run_tdnn_15k_semisupervised_conf_ak.sh | 471 +++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_al.sh | 471 +++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_am.sh | 470 +++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_an.sh | 471 +++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_ao.sh | 471 +++++++++++++++++ .../semisup/chain/tuning/run_tdnn_50k_c.sh | 22 +- .../semisup/chain/tuning/run_tdnn_50k_d.sh | 208 ++++++++ .../semisup/chain/tuning/run_tdnn_50k_e.sh | 208 ++++++++ .../run_tdnn_50k_semisupervised_conf_l.sh | 472 ++++++++++++++++++ .../run_tdnn_50k_semisupervised_conf_m.sh | 472 ++++++++++++++++++ .../run_tdnn_50k_semisupervised_conf_n.sh | 472 ++++++++++++++++++ ...or_common_pca.sh => run_ivector_common.sh} | 0 .../s5/local/semisup/run_15k_unk.sh | 74 --- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 39 +- egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh | 47 +- 17 files changed, 4259 insertions(+), 150 deletions(-) create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ak.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_al.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_am.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_an.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ao.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_d.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_e.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_l.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_m.sh create mode 100755 
egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_n.sh rename egs/fisher_english/s5/local/semisup/nnet3/{run_ivector_common_pca.sh => run_ivector_common.sh} (100%) delete mode 100644 egs/fisher_english/s5/local/semisup/run_15k_unk.sh diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ai.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ai.sh index 10e92661b16..3c08529e985 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ai.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ai.sh @@ -46,6 +46,7 @@ unsup_egs_dir= tree_affix=bi_h unsup_egs_opts= apply_deriv_weights=true +rescore_unsup_lattices=true do_finetuning=false @@ -130,19 +131,24 @@ for dset in $unsupervised_set; do $graphdir data/${dset}_sp_hires $chaindir/decode_${dset}_sp${decode_affix} fi - if [ $stage -le 6 ]; then - steps/lmrescore_const_arpa_undeterminized.sh --cmd "$decode_cmd" \ - --write-compact false --acwt 0.1 --beam 8.0 \ - data/lang_test${graph_affix} \ - data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ - $chaindir/decode_${dset}_sp${decode_affix} \ - $chaindir/decode_${dset}_sp${decode_affix}_fg + if $rescore_unsup_lattices; then + if [ $stage -le 6 ]; then + steps/lmrescore_const_arpa_undeterminized.sh --cmd "$decode_cmd" \ + --write-compact false --acwt 0.1 --beam 8.0 \ + data/lang_test${graph_affix} \ + data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ + $chaindir/decode_${dset}_sp${decode_affix} \ + $chaindir/decode_${dset}_sp${decode_affix}_fg + fi ln -sf ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}_fg/ || true + else + ln -sf ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}/ || true fi done -decode_affix=${decode_affix}_fg +$rescore_unsup_lattices && decode_affix=${decode_affix}_fg + if [ $stage -le 8 ]; then steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ data/${unsupervised_set}_sp_hires data/lang_chain \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aj.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aj.sh index 1affa2a4a34..cf6bcf82bc2 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aj.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aj.sh @@ -34,7 +34,6 @@ sup_frames_per_eg=150 lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data tolerance=1 -graph_affix=_poco_ex250k # can be used to decode the unsup data with another lm/graph phone_insertion_penalty= # Semi-supervised options @@ -46,7 +45,7 @@ unsup_egs_dir= tree_affix=bi_i unsup_egs_opts= apply_deriv_weights=true - +use_smart_splitting=true do_finetuning=false extra_left_context=0 @@ -95,6 +94,12 @@ fi # --nnet3-affix $nnet3_affix --tdnn-affix $tdnn_affix --exp $exp #fi +if $use_smart_splitting; then + comb_affix=${comb_affix}_smart +else + comb_affix=${comb_affix}_naive +fi + lang=data/lang_chain_unk unsup_decode_lang=data/lang_poco_test_ex250k_unk unsup_rescore_lang=${unsup_decode_lang}_big @@ -133,7 +138,7 @@ for dset in $unsupervised_set; do if [ $stage -le 4 ]; then echo "$0: getting the decoding lattices for the unsupervised subset using the chain model at: $chaindir" steps/nnet3/decode.sh --num-threads 4 --nj 
$decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 --write-compact false --skip-scoring true \\ + --acwt 1.0 --post-decode-acwt 10.0 --write-compact false --skip-scoring true \ --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ --scoring-opts "--min-lmwt 10 --max-lmwt 10" --determinize-opts "--word-determinize=false" \ $graphdir data/${dset}_sp_hires $chaindir/decode_${dset}_sp${decode_affix} @@ -299,8 +304,14 @@ if [ -z "$unsup_egs_dir" ]; then mkdir -p $unsup_egs_dir touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + echo "$0: generating egs from the unsupervised data" - steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ --left-tolerance $tolerance --right-tolerance $tolerance \ --left-context $left_context --right-context $right_context \ --left-context-initial $left_context_initial --right-context-final $right_context_final \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ak.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ak.sh new file mode 100644 index 00000000000..eff2ab7bfa6 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ak.sh @@ -0,0 +1,471 @@ +#!/bin/bash + +# This script is same as _aj, but uses 7i model and supervised lattices from UNK phone LM alignment. +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7i # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ak # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_i +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 
+finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + echo "$0: generating egs from the unsupervised data" + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_al.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_al.sh new file mode 100644 index 00000000000..4a12d96c5bf --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_al.sh @@ -0,0 +1,471 @@ +#!/bin/bash + +# This script is same as _aj, but uses pocolm for LM. +# sup_frames_per_eg=150 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7j # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=150 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1al # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_j +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
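+    # The next block chooses which egs-extraction script is used for the
+    # unsupervised data: get_egs_split.sh ("smart splitting" of the lattice
+    # supervision) when use_smart_splitting is true, else the standard
+    # get_egs.sh.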
+ + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + echo "$0: generating egs from the unsupervised data" + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_am.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_am.sh
new file mode 100644
index 00000000000..d0e21e0c25a
--- /dev/null
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_am.sh
@@ -0,0 +1,470 @@
+#!/bin/bash
+
+# This script is similar to _al, but builds a larger tree using unsupervised data.
+# sup_frames_per_eg=150
+# unsup_frames_per_eg=150
+# Deriv weights: Lattice posterior of best path pdf
+# Unsupervised weight: 1.0
+# Weights for phone LM (supervised, unsupervised): 5,2
+# LM for decoding unsupervised data: 4gram
+# Supervision: Smart split lattices
+
+set -u -e -o pipefail
+
+stage=-2
+train_stage=-100
+nj=40
+decode_nj=40
+base_train_set=semisup15k_250k # for reference
+exp=exp/semisup_15k
+
+unsupervised_set=train_unsup250k # set this to your choice of unsupervised data
+supervised_set=train_sup15k
+semi_affix=semi15k_250k # affix relating to the train-set splitting proportion
+
+tdnn_affix=7j # affix for the supervised chain-model directory
+train_supervised_opts="--stage -10 --train-stage -10"
+
+# Unsupervised options
+decode_affix=_unphdet
+egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir
+unsup_frames_per_eg=150 # if empty, will be equal to the supervised model's config unless sup_frames_per_eg is set
+                        # -- you will need to change minibatch_size for comb training accordingly
+sup_frames_per_eg=150
+lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices
+lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data
+tolerance=1
+phone_insertion_penalty=
+
+# Semi-supervised options
+comb_affix=comb1am # affix for the new chain-model directory trained on the combined supervised+unsupervised subsets
+supervision_weights=1.0,1.0
+lm_weights=5,2
+sup_egs_dir=
+unsup_egs_dir=
+tree_affix=
+unsup_egs_opts=
+apply_deriv_weights=true
+do_finetuning=false
+
+extra_left_context=0
+extra_right_context=0
+
+xent_regularize=0.1
+hidden_dim=725
+minibatch_size="150=128/300=64"
+# to tune:
+# frames_per_eg for unsupervised
+
+decode_iter=
+lang_test_suffix=
+
+finetune_stage=-2
+finetune_suffix=_finetune
+finetune_iter=final
+num_epochs_finetune=1
+finetune_xent_regularize=0.1
+finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05"
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs
+egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}
+
+RANDOM=0
+
+if ! cuda-compiled; then
+  cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor
+
+  sup_ali_dir=$exp/tri3
+
+  if [ -f $treedir/final.mdl ]; then
+    echo "$0: $treedir/final.mdl exists. Remove it and run again."
+ exit 1 + fi + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --use-fmllr false --context-opts "--context-width=2 --central-position=1" \ + --frame-subsampling-factor 3 \ + 7000 $lang \ + data/${supervised_set} \ + ${sup_ali_dir} \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir || exit 1 +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
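+    # The unsupervised egs are dumped from the decoded lattices: lattice LM
+    # scores are scaled by --lattice-lm-scale, lattices are pruned with
+    # --lattice-prune-beam, and per-frame deriv weights come from the
+    # best-path posteriors in weights.scp.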
+ + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_an.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_an.sh
new file mode 100644
index 00000000000..eb6cdca6b0c
--- /dev/null
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_an.sh
@@ -0,0 +1,471 @@
+#!/bin/bash
+
+# This script is the same as _al, but uses different frames-per-eg for the supervised data.
+# sup_frames_per_eg=160,140,110,80
+# unsup_frames_per_eg=150
+# Deriv weights: Lattice posterior of best path pdf
+# Unsupervised weight: 1.0
+# Weights for phone LM (supervised, unsupervised): 5,2
+# LM for decoding unsupervised data: 4gram
+# Supervision: Smart split lattices
+
+set -u -e -o pipefail
+
+stage=-2
+train_stage=-100
+nj=40
+decode_nj=40
+base_train_set=semisup15k_250k # for reference
+exp=exp/semisup_15k
+
+unsupervised_set=train_unsup250k # set this to your choice of unsupervised data
+supervised_set=train_sup15k
+semi_affix=semi15k_250k # affix relating to the train-set splitting proportion
+
+tdnn_affix=7j # affix for the supervised chain-model directory
+train_supervised_opts="--stage -10 --train-stage -10"
+
+# Unsupervised options
+decode_affix=_unphdet
+egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir
+unsup_frames_per_eg=150 # if empty, will be equal to the supervised model's config unless sup_frames_per_eg is set
+                        # -- you will need to change minibatch_size for comb training accordingly
+sup_frames_per_eg=160,140,110,80
+lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices
+lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data
+tolerance=1
+phone_insertion_penalty=
+
+# Semi-supervised options
+comb_affix=comb1an # affix for the new chain-model directory trained on the combined supervised+unsupervised subsets
+supervision_weights=1.0,1.0
+lm_weights=5,2
+sup_egs_dir=
+unsup_egs_dir=
+tree_affix=bi_j
+unsup_egs_opts=
+apply_deriv_weights=true
+use_smart_splitting=true
+do_finetuning=false
+
+extra_left_context=0
+extra_right_context=0
+
+xent_regularize=0.1
+hidden_dim=725
+minibatch_size="150=128/300=64"
+# to tune:
+# frames_per_eg for unsupervised
+
+decode_iter=
+lang_test_suffix=
+
+finetune_stage=-2
+finetune_suffix=_finetune
+finetune_iter=final
+num_epochs_finetune=1
+finetune_xent_regularize=0.1
+finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05"
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs
+egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}
+
+RANDOM=0
+
+if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
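+    # As in _al, pick the lattice-splitting egs script (get_egs_split.sh) when
+    # smart splitting is enabled, otherwise the standard get_egs.sh.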
+ + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + echo "$0: generating egs from the unsupervised data" + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ao.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ao.sh
new file mode 100644
index 00000000000..b4d83f83a73
--- /dev/null
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ao.sh
@@ -0,0 +1,471 @@
+#!/bin/bash
+
+# This script is the same as _ak, but uses different frames-per-eg for the supervised data.
+# sup_frames_per_eg=160,140,110,80
+# unsup_frames_per_eg=150
+# Deriv weights: Lattice posterior of best path pdf
+# Unsupervised weight: 1.0
+# Weights for phone LM (supervised, unsupervised): 5,2
+# LM for decoding unsupervised data: 4gram
+# Supervision: Smart split lattices
+
+set -u -e -o pipefail
+
+stage=-2
+train_stage=-100
+nj=40
+decode_nj=40
+base_train_set=semisup15k_250k # for reference
+exp=exp/semisup_15k
+
+unsupervised_set=train_unsup250k # set this to your choice of unsupervised data
+supervised_set=train_sup15k
+semi_affix=semi15k_250k # affix relating to the train-set splitting proportion
+
+tdnn_affix=7i # affix for the supervised chain-model directory
+train_supervised_opts="--stage -10 --train-stage -10"
+
+# Unsupervised options
+decode_affix=_unphdet
+egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir
+unsup_frames_per_eg=150 # if empty, will be equal to the supervised model's config unless sup_frames_per_eg is set
+                        # -- you will need to change minibatch_size for comb training accordingly
+sup_frames_per_eg=160,140,110,80
+lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices
+lattice_prune_beam=4.0 # If supplied, will prune the lattices prior to getting egs for unsupervised data
+tolerance=1
+phone_insertion_penalty=
+
+# Semi-supervised options
+comb_affix=comb1ao # affix for the new chain-model directory trained on the combined supervised+unsupervised subsets
+supervision_weights=1.0,1.0
+lm_weights=5,2
+sup_egs_dir=
+unsup_egs_dir=
+tree_affix=bi_i
+unsup_egs_opts=
+apply_deriv_weights=true
+use_smart_splitting=true
+do_finetuning=false
+
+extra_left_context=0
+extra_right_context=0
+
+xent_regularize=0.1
+hidden_dim=725
+minibatch_size="150=128/300=64"
+# to tune:
+# frames_per_eg for unsupervised
+
+decode_iter=
+lang_test_suffix=
+
+finetune_stage=-2
+finetune_suffix=_finetune
+finetune_iter=final
+num_epochs_finetune=1
+finetune_xent_regularize=0.1
+finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05"
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs
+egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}
+
+RANDOM=0
+
+if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
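+    # Select the unsupervised egs-extraction script according to
+    # use_smart_splitting (see the options at the top of this script).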
+ + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + echo "$0: generating egs from the unsupervised data" + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_c.sh index 
f84d4715f7c..b24ce252642 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_c.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_c.sh @@ -2,6 +2,7 @@ set -e # This is fisher chain recipe for training a model on a subset of around 50 hours. +# This is similar to _b, but uses biphone tree with upto 7000 leaves. # configs for 'chain' stage=0 @@ -10,8 +11,9 @@ train_stage=-10 get_egs_stage=-10 decode_iter= train_set=train_sup50k -ivector_train_set=semisup50k_250k -tree_affix= +unsup_train_set=train_unsup100k_250k +semisup_train_set=semisup50k_100k_250k +tree_affix=bi_c nnet3_affix=_semi50k_250k chain_affix=_semi50k_250k exp=exp/semisup_50k @@ -45,17 +47,18 @@ treedir=$exp/chain${chain_affix}/tree_${tree_affix} lat_dir=$exp/chain${chain_affix}/$(basename $gmm_dir)_${train_set}_sp_lats # training lattices directory dir=$exp/chain${chain_affix}/tdnn${tdnn_affix}_sp train_data_dir=data/${train_set}_sp_hires -train_ivector_dir=$exp/nnet3${nnet3_affix}/ivectors_${ivector_train_set}_sp_hires +train_ivector_dir=$exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires lang=data/lang_chain # The iVector-extraction and feature-dumping parts are the same as the standard # nnet3 setup, and you can skip them by setting "--stage 8" if you have already # run those things. -local/nnet3/run_ivector_common_pca.sh --stage $stage --exp $exp \ +local/semisup/nnet3/run_ivector_common_pca.sh --stage $stage --exp $exp \ --speed-perturb true \ --train-set $train_set \ - --ivector-train-set $ivector_train_set \ + --unsup-train-set $unsup_train_set \ + --semisup-train-set $semisup_train_set \ --nnet3-affix $nnet3_affix || exit 1 if [ $stage -le 9 ]; then @@ -66,6 +69,15 @@ if [ $stage -le 9 ]; then rm $lat_dir/fsts.*.gz # save space fi +if [ $stage -le 10 ]; then + utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \ + ${train_data_dir} ${train_data_dir}_max2 + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + ${train_data_dir}_max2 $exp/nnet3${nnet3_affix}/extractor \ + $exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires +fi + if [ $stage -le 10 ]; then # Create a version of the lang/ directory that has one state per phone in the # topo file. [note, it really has two states.. the first one is only repeated diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_d.sh new file mode 100755 index 00000000000..b2fb59e4fed --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_d.sh @@ -0,0 +1,208 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 50 hours. +# This is similar to _c, but uses a phone LM UNK model + +# configs for 'chain' +stage=0 +tdnn_affix=7d +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup50k +unsup_train_set=train_unsup100k_250k +semisup_train_set=semisup50k_100k_250k +tree_affix=bi_d +nnet3_affix=_semi50k_250k +chain_affix=_semi50k_250k +exp=exp/semisup_50k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=8 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
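+  # This is a biphone tree (--context-width=2 --central-position=1) with up to
+  # 7000 leaves, built only on the supervised subset.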
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
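A note on the learning-rate-factor line in the config block above: with the default xent_regularize=0.1 it evaluates to 0.5 / 0.1 = 5.0. The inline "print 0.5/$xent_regularize" assumes a python2 interpreter on the PATH; the sketch below is a python3-safe equivalent given purely as an illustration, not part of the checked-in recipe.

# illustrative only -- not part of the recipe above
xent_regularize=0.1
learning_rate_factor=$(python -c "print(0.5 / $xent_regularize)")
echo $learning_rate_factor   # 5.0: scales the xent output's learning rate so it does not depend on xent_regularize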
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_e.sh new file mode 100755 index 00000000000..d1085716430 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_e.sh @@ -0,0 +1,208 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 50 hours. +# This is similar to _c, but uses poco LM for decoding. + +# configs for 'chain' +stage=0 +tdnn_affix=7e +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup50k +unsup_train_set=train_unsup100k_250k +semisup_train_set=semisup50k_100k_250k +tree_affix=bi_e +nnet3_affix=_semi50k_250k +chain_affix=_semi50k_250k +exp=exp/semisup_50k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=500 + +# training options +num_epochs=8 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_l.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_l.sh new file mode 100755 index 00000000000..a4971398133 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_l.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# This script is same as _k, but uses biphone tree. 
+# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1l # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width `cat $comb_egs_dir/info/frames_per_eg` \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_m.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_m.sh new 
file mode 100755 index 00000000000..08b78f675f2 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_m.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# This script is the same as _k, but uses a biphone tree. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1m # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
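The unsup_frames_per_eg fallback a few lines above is terse; with illustrative values (the real frames_per_eg is read from the supervised model's egs/info/frames_per_eg) it behaves as follows.

# illustration only
frames_per_eg=150
unsup_frames_per_eg=        # left empty, as in the defaults at the top of this script
[ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg
echo $unsup_frames_per_eg   # 150 -- unsupervised chunks default to the supervised chunk width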
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width `cat $comb_egs_dir/info/frames_per_eg` \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_n.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_n.sh new 
file mode 100755 index 00000000000..aab0d7a2c6a --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_n.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# This script is the same as _m, but does not use the UNK LM. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating to the train-set splitting proportion + +tdnn_affix=7e # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1n # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix=bi_e +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
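A note on the subsampling flags: the supervised get_egs.sh call above passes --alignment-subsampling-factor 3 because its lattices come from the GMM system at the full frame rate, whereas the unsupervised call below passes 1 because those lattices were decoded with the chain model and so are already at the reduced output frame rate. The chunk arithmetic itself is simple; an illustrative check:

# example values only; nothing here is read from a config
frames_per_eg=150
frame_subsampling_factor=3
echo $[frames_per_eg / frame_subsampling_factor]   # 50 supervision frames per 150-frame chunk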
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width `cat $comb_egs_dir/info/frames_per_eg` \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common_pca.sh b/egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common.sh similarity index 100% rename from 
egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common_pca.sh rename to egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common.sh diff --git a/egs/fisher_english/s5/local/semisup/run_15k_unk.sh b/egs/fisher_english/s5/local/semisup/run_15k_unk.sh deleted file mode 100644 index 41590dd9fe2..00000000000 --- a/egs/fisher_english/s5/local/semisup/run_15k_unk.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Vimal Manohar -# Apache 2.0 - -. cmd.sh -. path.sh - -stage=-1 -train_stage=-10 - -. utils/parse_options.sh - -set -o pipefail -exp=exp/semisup_15k - -false && { -utils/subset_data_dir.sh --speakers data/train_sup 15000 data/train_sup15k || exit 1 -utils/subset_data_dir.sh --shortest data/train_sup15k 5000 data/train_sup15k_short || exit 1 -utils/subset_data_dir.sh data/train_sup15k 7500 data/train_sup15k_half || exit 1 - -steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ - data/train_sup15k_short data/lang $exp/mono0a || exit 1 - -steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_sup15k_half data/lang $exp/mono0a $exp/mono0a_ali || exit 1 - -steps/train_deltas.sh --cmd "$train_cmd" \ - 2000 10000 data/train_sup15k_half data/lang $exp/mono0a_ali $exp/tri1 || exit 1 - -(utils/mkgraph.sh data/lang_test $exp/tri1 $exp/tri1/graph - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - $exp/tri1/graph data/dev $exp/tri1/decode_dev)& - -steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_sup15k data/lang $exp/tri1 $exp/tri1_ali || exit 1; - -steps/train_lda_mllt.sh --cmd "$train_cmd" \ - 2500 15000 data/train_sup15k data/lang $exp/tri1_ali $exp/tri2 || exit 1; - -(utils/mkgraph.sh data/lang_test $exp/tri2 $exp/tri2/graph - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - $exp/tri2/graph data/dev $exp/tri2/decode_dev)& - -steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_sup15k data/lang $exp/tri2 $exp/tri2_ali || exit 1; - -steps/train_sat.sh --cmd "$train_cmd" \ - 2500 15000 data/train_sup15k data/lang $exp/tri2_ali $exp/tri3 || exit 1; - -( - utils/mkgraph.sh data/lang_test $exp/tri3 $exp/tri3/graph - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - $exp/tri3/graph data/dev $exp/tri3/decode_dev -)& - -utils/combine_data.sh data/semisup15k_250k data/train_sup15k data/train_unsup250k || exit 1 - -local/semisup/chain/tuning/run_tdnn_11k.sh \ - --train-set train_sup15k \ - --nnet3-affix _semi15k_250k \ - --chain-affix _semi15k_250k \ - --stage $stage --train-stage $train_stage \ - --exp $exp \ - --ivector-train-set semisup15k_250k || exit 1 -} - -false && local/semisup/chain/tuning/run_tdnn_oracle.sh \ - --train-set semisup15k_250k \ - --nnet3-affix _semi15k_250k \ - --chain-affix _semi15k_250k_oracle \ - --stage 9 --train-stage $train_stage \ - --exp $exp \ - --ivector-train-set semisup15k_250k || exit 1 diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 20fcd5db802..e2f9526be34 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -78,7 +78,6 @@ phone_insertion_penalty= deriv_weights_scp= generate_egs_scp=false no_chunking=false -lat_copy_src= echo "$0 $@" # Print the command line for logging @@ -307,12 +306,12 @@ chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$ali normalization_scale=1.0 -lattice_copy_cmd="ark:-" +lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" if [ ! 
-z $lattice_prune_beam ]; then if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then - lattice_copy_cmd="ark:- | lattice-1best --acoustic-scale=$acwt ark:- ark:-" + lats_rspecifier="$lats_rspecifier lattice-1best --acoustic-scale=$acwt ark:- ark:- |" else - lattice_copy_cmd="ark:- | lattice-prune --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:-" + lats_rspecifier="$lats_rspecifier lattice-prune --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:- |" fi fi @@ -346,31 +345,20 @@ echo $left_context_initial > $dir/info/left_context_initial echo $right_context_final > $dir/info/right_context_final if [ $stage -le 2 ]; then - if [ ! -z "$lat_copy_src" ]; then - ln -sf `readlink -f $lat_copy_src`/lat.*.{ark,scp} $dir/ - cat $lat_copy_src/lat.{?,??}.scp > $dir/lat.scp - fi - echo "$0: Getting validation and training subset examples in background." rm $dir/.error 2>/dev/null ( - if [ -z "$lat_copy_src" ]; then - $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \ - lattice-copy --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \ - "ark:gunzip -c $latdir/lat.JOB.gz|" \ - ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1 + $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \ + lattice-copy --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \ + "$lats_rspecifier" \ + ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1 - for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp - else - # do the filtering just once, as lat.scp may be long. - utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ - <$dir/lat.scp >$dir/lat_special.scp - fi + for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp $cmd $dir/log/create_valid_subset.log \ utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ - lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- $lattice_copy_cmd \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ chain-get-supervision $chain_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=$srand \ @@ -378,7 +366,7 @@ if [ $stage -le 2 ]; then "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || exit 1 & $cmd $dir/log/create_train_subset.log \ utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ - lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- $lattice_copy_cmd \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ chain-get-supervision $chain_supervision_all_opts \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=$srand \ @@ -443,14 +431,9 @@ if [ $stage -le 4 ]; then # files is the product of 'nj' by 'num_archives_intermediate', which might be # quite large. - lattice_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" - if [ ! 
-z "$lat_copy_src" ]; then - lattice_rspecifier="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp |" - fi - $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \ lattice-align-phones --replace-output-symbols=true $latdir/final.mdl \ - "$lattice_rspecifier" $lattice_copy_cmd \| \ + "$lats_rspecifier" ark:- \| \ chain-get-supervision $chain_supervision_all_opts \ --weight=$egs_weight \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh index 786db3fe31f..e0fd6b5c01a 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh @@ -77,7 +77,6 @@ acwt=0.1 # For pruning phone_insertion_penalty= deriv_weights_scp= generate_egs_scp=false -lat_copy_src= echo "$0 $@" # Print the command line for logging @@ -289,12 +288,12 @@ chain_supervision_all_opts="--supervision.frame-subsampling-factor=$alignment_su normalization_scale=1.0 -lattice_copy_cmd="ark:-" +lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" if [ ! -z $lattice_prune_beam ]; then if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then - lattice_copy_cmd="ark:- | lattice-1best --acoustic-scale=$acwt ark:- ark:-" + lats_rspecifier="$lats_rspecifier lattice-1best --acoustic-scale=$acwt ark:- ark:- |" else - lattice_copy_cmd="ark:- | lattice-prune --write-compact=false --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:-" + lats_rspecifier="$lats_rspecifier lattice-prune --write-compact=false --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:- |" fi fi @@ -330,39 +329,28 @@ echo $left_context_initial > $dir/info/left_context_initial echo $right_context_final > $dir/info/right_context_final if [ $stage -le 2 ]; then - if [ ! -z "$lat_copy_src" ]; then - ln -sf `readlink -f $lat_copy_src`/lat.*.{ark,scp} $dir/ - cat $lat_copy_src/lat.{?,??}.scp > $dir/lat.scp - fi - echo "$0: Getting validation and training subset examples in background." rm $dir/.error 2>/dev/null ( - if [ -z "$lat_copy_src" ]; then - $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \ - lattice-copy --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \ - --write-compact=false \ - "ark:gunzip -c $latdir/lat.JOB.gz|" \ - ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1 + $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \ + lattice-copy --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \ + --write-compact=false \ + "$lats_rspecifier" \ + ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1 - for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp - else - # do the filtering just once, as lat.scp may be long. 
- utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ - <$dir/lat.scp >$dir/lat_special.scp - fi + for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp $cmd $dir/log/create_valid_subset.log \ utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ - lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- $lattice_copy_cmd \| \ + lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=$srand \ $egs_opts $chaindir/normalization.fst \ "$valid_feats" $chaindir/tree $chaindir/0.trans_mdl \ ark,s,cs:- "ark:$dir/valid_all.cegs" || exit 1 & $cmd $dir/log/create_train_subset.log \ utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ - lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- $lattice_copy_cmd \| \ + lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=$srand \ $egs_opts $chaindir/normalization.fst \ "$train_subset_feats" $chaindir/tree $chaindir/0.trans_mdl \ @@ -426,15 +414,12 @@ if [ $stage -le 4 ]; then # files is the product of 'nj' by 'num_archives_intermediate', which might be # quite large. - lattice_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" - if [ ! -z "$lat_copy_src" ]; then - lattice_rspecifier="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp |" - fi - $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \ lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl \ - "$lattice_rspecifier" $lattice_copy_cmd \| \ - nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=\$[JOB+$srand] $egs_opts --supervision.weight=$egs_weight \ + "$lats_rspecifier" ark:- \| \ + nnet3-chain-split-and-get-egs $chain_supervision_all_opts \ + --supervision.weight=$egs_weight \ + $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ --num-frames-overlap=$frames_overlap_per_eg \ "$feats" $chaindir/tree $chaindir/0.trans_mdl \ ark,s,cs:- ark:- \| \ @@ -495,7 +480,7 @@ if [ $stage -le 5 ]; then if $generate_egs_scp; then #concatenate cegs.JOB.scp in single cegs.scp - rm -rf $dir/cegs.scp + rm -f $dir/cegs.scp for j in $(seq $num_archives_intermediate); do for y in $(seq $archives_multiple); do cat $dir/cegs.$j.$y.scp || exit 1; From 76cc0a0eebf95ba745b88c1f39aabfd8e47befef Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 28 Nov 2017 18:20:29 -0500 Subject: [PATCH 102/174] semisup-smbr: Adding more recipes --- .../run_tdnn_100k_semisupervised_conf_a.sh | 4 +-- .../run_tdnn_100k_semisupervised_conf_b.sh | 3 +- .../run_tdnn_100k_semisupervised_conf_g.sh | 28 ++++++++++++------- .../semisup/chain/tuning/run_tdnn_11k.sh | 10 +++---- .../semisup/chain/tuning/run_tdnn_15k_h.sh | 2 +- .../semisup/chain/tuning/run_tdnn_50k.sh | 2 +- .../semisup/chain/tuning/run_tdnn_oracle.sh | 3 +- .../local/semisup/nnet3/run_ivector_common.sh | 10 ++++--- .../s5/steps/libs/nnet3/report/log_parse.py | 4 +-- 9 files changed, 38 insertions(+), 28 deletions(-) diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a.sh index 7d956149ef7..17e0d7609a8 100644 --- 
a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_a.sh @@ -145,8 +145,8 @@ for dset in $unsupervised_set; do fi if [ $stage -le 6 ]; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ - --write-compact false --determinize false --skip-scoring true \ + steps/lmrescore_const_arpa_undeterminized.sh --cmd "$decode_cmd" \ + --write-compact false --acwt 0.1 --beam 8.0 --skip-scoring true \ data/lang_test${graph_affix} \ data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ $chaindir/decode_${dset}_sp${decode_affix} \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_b.sh index 609e2009280..9331642f43b 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_b.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_b.sh @@ -145,8 +145,7 @@ for dset in $unsupervised_set; do if [ $stage -le 6 ]; then steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ - --write-compact false --read-determinized false --write-determinized false \ - --skip-scoring true \ + --acwt 0.1 --beam 8.0 --write-compact false --skip-scoring true \ data/lang_test${graph_affix} \ data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ $chaindir/decode_${dset}_sp${decode_affix} \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_g.sh index e62c52fe907..dbfde9787aa 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_g.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_conf_g.sh @@ -37,6 +37,7 @@ lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting e tolerance=1 graph_affix=_sup100k # can be used to decode the unsup data with another lm/graph phone_insertion_penalty= +rescore_unsup_lattices=true # Semi-supervised options comb_affix=comb1g # affix for new chain-model directory trained on the combined supervised+unsupervised subsets @@ -146,18 +147,25 @@ for dset in $unsupervised_set; do $graphdir data/${dset}_sp_hires $chaindir/decode_${dset}_sp${decode_affix} fi - #if [ $stage -le 6 ]; then - # steps/lmrescore_const_arpa_undeterminized.sh --cmd "$decode_cmd" \ - # --write-compact false --acwt 0.1 --beam 8.0 --skip-scoring true \ - # data/lang_test${graph_affix} \ - # data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ - # $chaindir/decode_${dset}_sp${decode_affix} \ - # $chaindir/decode_${dset}_sp${decode_affix}_fg - - #fi - ln -sf ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}/final.mdl + if $rescore_unsup_lattices; then + if [ $stage -le 6 ]; then + steps/lmrescore_const_arpa_undeterminized.sh --cmd "$decode_cmd" \ + --write-compact false --acwt 0.1 --beam 8.0 --skip-scoring true \ + data/lang_test${graph_affix} \ + data/lang_test${graph_affix}_fg data/${dset}_sp_hires \ + $chaindir/decode_${dset}_sp${decode_affix} \ + $chaindir/decode_${dset}_sp${decode_affix}_fg + fi + ln -sf ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}_fg/final.mdl + else + ln -sf ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}/final.mdl + fi done +if $rescore_unsup_lattices; then + decode_affix=${decode_affix}_fg +fi + if [ $stage -le 8 ]; then 
steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ data/${unsupervised_set}_sp_hires data/lang_chain \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k.sh index 81335e5ae5b..d4cb820a03b 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k.sh @@ -10,7 +10,7 @@ train_stage=-10 get_egs_stage=-10 decode_iter= train_set=train_sup11k -ivector_train_set=semisup11k_250k +unsup_train_set=train_unsup_250k_240k tree_affix= nnet3_affix=_semi11k_250k chain_affix=_semi11k_250k @@ -45,18 +45,18 @@ treedir=$exp/chain${chain_affix}/tree_${tree_affix} lat_dir=$exp/chain${chain_affix}/$(basename $gmm_dir)_${train_set}_sp_lats # training lattices directory dir=$exp/chain${chain_affix}/tdnn${tdnn_affix}_sp train_data_dir=data/${train_set}_sp_hires -train_ivector_dir=$exp/nnet3${nnet3_affix}/ivectors_${ivector_train_set}_sp_hires +train_ivector_dir=$exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires lang=data/lang_chain # The iVector-extraction and feature-dumping parts are the same as the standard # nnet3 setup, and you can skip them by setting "--stage 8" if you have already # run those things. -local/nnet3/run_ivector_common_pca.sh --stage $stage --exp $exp \ +local/semisup/nnet3/run_ivector_common_pca.sh --stage $stage --exp $exp \ --speed-perturb true \ --train-set $train_set \ - --ivector-train-set $ivector_train_set \ - --nnet3-affix $nnet3_affix || exit 1 + --unsup-train-set $unsup_train_set \ + --nnet3-affix "$nnet3_affix" || exit 1 if [ $stage -le 9 ]; then # Get the alignments as lattices (gives the chain training more freedom). diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_h.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_h.sh index 057513707f8..978df45345f 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_h.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_h.sh @@ -2,7 +2,7 @@ set -e # This is fisher chain recipe for training a model on a subset of around 15 hours. -# This is similar to _c, but uses a biphone tree with up to 7000 leaves. +# This is similar to _d, but trains tree using even unsupervised data. # configs for 'chain' stage=0 diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k.sh index bf82256545a..6366cfdad3a 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -# This is fisher chain recipe for training a model on a subset of around 10 hours. +# This is fisher chain recipe for training a model on a subset of around 50 hours. 
# configs for 'chain' stage=0 diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_oracle.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_oracle.sh index aa0e433c526..7b6f1716247 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_oracle.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_oracle.sh @@ -56,7 +56,7 @@ local/nnet3/run_ivector_common_pca.sh --stage $stage --exp $exp \ --speed-perturb true \ --train-set $train_set \ --ivector-train-set $ivector_train_set \ - --nnet3-affix $nnet3_affix || exit 1 + --nnet3-affix "$nnet3_affix" || exit 1 if [ $stage -le 9 ]; then # Get the alignments as lattices (gives the chain training more freedom). @@ -140,6 +140,7 @@ if [ $stage -le 13 ]; then /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi + mkdir -p $dir/egs touch $dir/egs/.nodelete # keep egs around when that run dies. steps/nnet3/chain/train.py --stage $train_stage \ diff --git a/egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common.sh b/egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common.sh index aae97e145d5..718d1aaed04 100755 --- a/egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common.sh +++ b/egs/fisher_english/s5/local/semisup/nnet3/run_ivector_common.sh @@ -52,8 +52,8 @@ if [ "$speed_perturb" == "true" ]; then fi fi -if [ $stage -le 2 ]; then - if [ ! -z "$unsup_train_set" ]; then +if [ ! -z "$unsup_train_set" ]; then + if [ $stage -le 2 ]; then utils/combine_data.sh data/${semisup_train_set}_sp \ data/${train_set}_sp data/${unsup_train_set}_sp fi @@ -91,8 +91,10 @@ fi ivector_train_set=${train_set}_sp if [ ! -z "$unsup_train_set" ]; then - utils/combine_data.sh data/${semisup_train_set}_sp_hires \ - data/${train_set}_sp_hires data/${unsup_train_set}_sp_hires + if [ $stage -le 3 ]; then + utils/combine_data.sh data/${semisup_train_set}_sp_hires \ + data/${train_set}_sp_hires data/${unsup_train_set}_sp_hires + fi ivector_train_set=${semisup_train_set}_sp fi diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index d21578baecb..6a66c60dd6b 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -398,7 +398,7 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): except: tb = traceback.format_exc() logger.warning("Error getting info from logs, exception was: " + tb) - times = [] + times = {} report = [] report.append("%Iter\tduration\ttrain_loss\tvalid_loss\tdifference") @@ -412,7 +412,7 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): try: report.append("%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), x[1], x[2], x[2]-x[1])) - except KeyError: + except KeyError, IndexError: continue total_time = 0 From 47ab45a99d7a8e58a7323be099e9f15a25fd0735 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 28 Nov 2017 18:20:59 -0500 Subject: [PATCH 103/174] semisup-smbr: Add stages to scoring scripts --- egs/wsj/s5/steps/scoring/score_kaldi_wer.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/egs/wsj/s5/steps/scoring/score_kaldi_wer.sh b/egs/wsj/s5/steps/scoring/score_kaldi_wer.sh index 9393f5616c5..6651a744e4d 100755 --- a/egs/wsj/s5/steps/scoring/score_kaldi_wer.sh +++ b/egs/wsj/s5/steps/scoring/score_kaldi_wer.sh @@ -62,11 +62,10 @@ fi mkdir -p $dir/scoring${scoring_affix} cat $data/text | $ref_filtering_cmd > 
$dir/scoring${scoring_affix}/test_filt.txt || exit 1; -if [ $stage -le 0 ]; then - - for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do - mkdir -p $dir/scoring${scoring_affix}/penalty_$wip/log +for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + mkdir -p $dir/scoring${scoring_affix}/penalty_$wip/log + if [ $stage -le 0 ]; then if $decode_mbr ; then $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring${scoring_affix}/penalty_$wip/log/best_path.LMWT.log \ acwt=\`perl -e \"print 1.0/LMWT\"\`\; \ @@ -86,18 +85,19 @@ if [ $stage -le 0 ]; then utils/int2sym.pl -f 2- $symtab \| \ $hyp_filtering_cmd '>' $dir/scoring${scoring_affix}/penalty_$wip/LMWT.txt || exit 1; fi + fi + + if [ $stage -le 1 ]; then $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring${scoring_affix}/penalty_$wip/log/score.LMWT.log \ cat $dir/scoring${scoring_affix}/penalty_$wip/LMWT.txt \| \ compute-wer --text --mode=present \ ark:$dir/scoring${scoring_affix}/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; + fi - done -fi - - +done -if [ $stage -le 1 ]; then +if [ $stage -le 2 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do for lmwt in $(seq $min_lmwt $max_lmwt); do From 37bb89760f78e82cd91674a6d657078ad109f86b Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 1 Dec 2017 00:42:45 -0500 Subject: [PATCH 104/174] semisup: unk model script --- egs/fisher_english/s5/local/run_unk_model.sh | 46 +++++-------------- .../semisup/chain/tuning/run_tdnn_50k_e.sh | 2 +- .../s5/local/semisup/run_100k.sh | 42 ++++++++++------- .../s5/local/semisup/run_15k.sh | 33 +++++++++++-- .../s5/local/semisup/run_50k.sh | 28 +++++++++++ 5 files changed, 95 insertions(+), 56 deletions(-) diff --git a/egs/fisher_english/s5/local/run_unk_model.sh b/egs/fisher_english/s5/local/run_unk_model.sh index 924203c5ff7..5e390549cc3 100755 --- a/egs/fisher_english/s5/local/run_unk_model.sh +++ b/egs/fisher_english/s5/local/run_unk_model.sh @@ -2,9 +2,12 @@ # Copyright 2017 Vimal Manohar +lang_dirs= + utils/lang/make_unk_lm.sh data/local/dict exp/unk_lang_model || exit 1 -utils/prepare_lang.sh --unk-fst exp/unk_lang_model/unk_fst.txt \ +utils/prepare_lang.sh \ + --unk-fst exp/unk_lang_model/unk_fst.txt \ data/local/dict "" data/local/lang data/lang_unk # note: it's important that the LM we built in data/lang/G.fst was created using @@ -12,44 +15,17 @@ utils/prepare_lang.sh --unk-fst exp/unk_lang_model/unk_fst.txt \ # keeps the graph compact after adding the unk model (we only have to add one # copy of it). -mkdir -p data/lang_poco_test_unk -cp -r data/lang_unk/* data/lang_poco_test_unk -cp data/lang_poco_test/G.fst data/lang_poco_test_unk/G.fst - -mkdir -p data/lang_poco_test_ex250k_unk -cp -r data/lang_unk/* data/lang_poco_test_ex250k_unk -cp data/lang_poco_test_ex250k/G.fst data/lang_poco_test_ex250k_unk/G.fst +for lang_dir in $lang_dirs; do + rm -r ${lang_dir}_unk 2>/dev/null || true + mkdir -p ${lang_dir}_unk + cp -r data/lang_unk ${lang_dir}_unk + if [ -f ${lang_dir}/G.fst ]; then cp ${lang_dir}/G.fst ${lang_dir}_unk/G.fst; fi + if [ -f ${lang_dir}/G.carpa ]; then cp ${lang_dir}/G.carpa ${lang_dir}_unk/G.carpa; fi +done exit 0 -utils/mkgraph.sh data/lang_unk exp/tri3 exp/tri3/graph_unk - -. ./cmd.sh - ## Caution: if you use this unk-model stuff, be sure that the scoring script ## does not use lattice-align-words-lexicon, because it's not compatible with ## the unk-model. Instead you should use lattice-align-words (of course, this ## only works if you have position-dependent phones). 
- -decode_nj=30 -for dset in dev test; do - steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ - exp/tri3/graph_unk data/${dset} exp/tri3/decode_${dset}_unk - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset} exp/tri3/decode_${dset}_unk exp/tri3/decode_${dset}_unk_rescore -done - -# # for x in exp/tri3/decode*; do grep Sum $x/*/*ys | utils/best_wer.sh ; done | grep -v old | grep -v si - -# # dev results. unk-model helps slightly before rescoring. -# %WER 19.3 | 507 17783 | 83.7 11.6 4.7 3.0 19.3 91.5 | -0.076 | exp/tri3/decode_dev/score_17_0.0/ctm.filt.filt.sys -# %WER 18.2 | 507 17783 | 84.8 10.7 4.5 3.0 18.2 91.3 | -0.111 | exp/tri3/decode_dev_rescore/score_16_0.0/ctm.filt.filt.sys -# %WER 19.1 | 507 17783 | 83.7 11.3 5.1 2.8 19.1 91.9 | -0.044 | exp/tri3/decode_dev_unk/score_17_0.0/ctm.filt.filt.sys -# %WER 18.2 | 507 17783 | 84.5 10.6 4.9 2.8 18.2 91.5 | -0.047 | exp/tri3/decode_dev_unk_rescore/score_15_0.0/ctm.filt.filt.sys - - -# # dev results. unk-model helps slightly after rescoring. -# %WER 17.3 | 1155 27500 | 85.0 11.5 3.5 2.4 17.3 86.9 | -0.035 | exp/tri3/decode_test/score_15_0.0/ctm.filt.filt.sys -# %WER 16.6 | 1155 27500 | 85.8 11.0 3.2 2.4 16.6 86.4 | -0.098 | exp/tri3/decode_test_rescore/score_14_0.0/ctm.filt.filt.sys -# %WER 17.3 | 1155 27500 | 84.9 11.3 3.8 2.2 17.3 87.4 | -0.015 | exp/tri3/decode_test_unk/score_15_0.0/ctm.filt.filt.sys -# %WER 16.5 | 1155 27500 | 85.7 10.7 3.6 2.2 16.5 86.7 | -0.075 | exp/tri3/decode_test_unk_rescore/score_14_0.0/ctm.filt.filt.sys diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_e.sh index d1085716430..207dd5b40f7 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_e.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_e.sh @@ -200,7 +200,7 @@ if [ $stage -le 15 ]; then steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ - $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + $graph_dir data/${decode_set}_hires $dir/decode_poco_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; ) & done fi diff --git a/egs/fisher_english/s5/local/semisup/run_100k.sh b/egs/fisher_english/s5/local/semisup/run_100k.sh index 05e6622c582..8458939cf3c 100644 --- a/egs/fisher_english/s5/local/semisup/run_100k.sh +++ b/egs/fisher_english/s5/local/semisup/run_100k.sh @@ -14,7 +14,11 @@ train_stage=-10 set -o pipefail exp=exp/semisup_100k -false && { +if [ ! -f data/train_sup/utt2spk ]; then + echo "$0: Could not find data/train_sup/utt2spk" + exit 1 +fi + utils/subset_data_dir.sh --shortest data/train_sup 100000 data/train_sup_100kshort utils/subset_data_dir.sh data/train_sup_100kshort 10000 data/train_sup_10k utils/data/remove_dup_utts.sh 100 data/train_sup_10k data/train_sup_10k_nodup @@ -71,30 +75,34 @@ steps/train_sat.sh --cmd "$train_cmd" \ utils/copy_data_dir.sh data/train_unsup250k data/train_unsup100k_250k utils/combine_data.sh data/semisup100k_250k data/train_sup \ data/train_unsup100k_250k || exit 1 -} + +if [ ! 
-f data/lang_test_poco_sup100k_big/G.carpa ]; then + local/fisher_train_lms_pocolm.sh \ + --text data/train_sup/text \ + --dir data/local/lm_sup100k + + local/fisher_create_test_lang.sh \ + --arpa-lm data/local/pocolm_sup100k/data/arpa/4gram_small.arpa.gz \ + --dir data/lang_test_poco_sup100k + + utils/build_const_arpa_lm.sh \ + data/local/pocolm_sup100k/data/arpa/4gram_big.arpa.gz \ + data/lang_test_poco_sup100k data/lang_test_poco_sup100k_big +fi + +local/run_unk_model.sh --lang-dirs "data/lang_test_poco_sup100k_big data/lang_test_poco_sup100k" || exit 1 local/semisup/chain/tuning/run_tdnn_100k.sh \ --train-set train_sup \ --stage $stage --train-stage $train_stage \ --exp $exp \ - --ivector-train-set train_sup || exit 1 + --ivector-train-set "" || exit 1 -local/fisher_train_lms.sh --text data/train_sup/text \ - --dir data/local/lm_sup100k - -local/fisher_create_test_lang.sh \ - --arpa-lm data/local/lm_sup100k/3gram-mincount/lm_unpruned.gz \ - --dir data/lang_test_sup100k - -utils/build_const_arpa_lm.sh \ - data/local/lm_sup100k/4gram-mincount/lm_unpruned.gz \ - data/lang_test_sup100k data/lang_test_sup100k_fg - -false && local/semisup/chain/tuning/run_tdnn_oracle.sh \ +local/semisup/chain/tuning/run_tdnn_oracle.sh \ --train-set train_sup \ --nnet3-affix \ --chain-affix \ + --gmm tri4a \ --stage 9 --train-stage $train_stage \ --exp $exp \ - --ivector-train-set semisup15k_250k || exit 1 - + --ivector-train-set "" || exit 1 diff --git a/egs/fisher_english/s5/local/semisup/run_15k.sh b/egs/fisher_english/s5/local/semisup/run_15k.sh index 41590dd9fe2..f64ea6221c0 100644 --- a/egs/fisher_english/s5/local/semisup/run_15k.sh +++ b/egs/fisher_english/s5/local/semisup/run_15k.sh @@ -14,7 +14,13 @@ train_stage=-10 set -o pipefail exp=exp/semisup_15k -false && { +for f in data/train_sup/utt2spk data/train_unsup250k/utt2spk ]; do + if [ ! -f $f ]; then + echo "$0: Could not find $f" + exit 1 + fi +done + utils/subset_data_dir.sh --speakers data/train_sup 15000 data/train_sup15k || exit 1 utils/subset_data_dir.sh --shortest data/train_sup15k 5000 data/train_sup15k_short || exit 1 utils/subset_data_dir.sh data/train_sup15k 7500 data/train_sup15k_half || exit 1 @@ -56,6 +62,27 @@ steps/train_sat.sh --cmd "$train_cmd" \ utils/combine_data.sh data/semisup15k_250k data/train_sup15k data/train_unsup250k || exit 1 +mkdir -p data/local/pocolm_ex250k + +utils/filter_scp.pl --exclude data/train_unsup250k/utt2spk \ + data/train/text > data/local/pocolm_ex250k/text.tmp + +if [ ! 
-f data/lang_test_poco_ex250k_big/G.carpa ]; then + local/fisher_train_lms_pocolm.sh \ + --text data/local/pocolm_ex250k/text.tmp \ + --dir data/local/pocolm_ex250k + + local/fisher_create_test_lang.sh \ + --arpa-lm data/local/pocolm_ex250k/data/arpa/4gram_small.arpa.gz \ + --dir data/lang_test_poco_ex250k + + utils/build_const_arpa_lm.sh \ + data/local/pocolm_ex250k/data/arpa/4gram_big.arpa.gz \ + data/lang_test_poco_ex250k data/lang_test_poco_ex250k_big +fi + +local/run_unk_model.sh --lang-dirs "data/lang_test_poco_ex250k_big data/lang_test_poco_ex250k" || exit 1 + local/semisup/chain/tuning/run_tdnn_11k.sh \ --train-set train_sup15k \ --nnet3-affix _semi15k_250k \ @@ -63,12 +90,12 @@ local/semisup/chain/tuning/run_tdnn_11k.sh \ --stage $stage --train-stage $train_stage \ --exp $exp \ --ivector-train-set semisup15k_250k || exit 1 -} -false && local/semisup/chain/tuning/run_tdnn_oracle.sh \ +local/semisup/chain/tuning/run_tdnn_oracle.sh \ --train-set semisup15k_250k \ --nnet3-affix _semi15k_250k \ --chain-affix _semi15k_250k_oracle \ + --gmm tri3 \ --stage 9 --train-stage $train_stage \ --exp $exp \ --ivector-train-set semisup15k_250k || exit 1 diff --git a/egs/fisher_english/s5/local/semisup/run_50k.sh b/egs/fisher_english/s5/local/semisup/run_50k.sh index e69997cba2c..6a1ca07f6f3 100644 --- a/egs/fisher_english/s5/local/semisup/run_50k.sh +++ b/egs/fisher_english/s5/local/semisup/run_50k.sh @@ -14,6 +14,13 @@ train_stage=-10 set -o pipefail exp=exp/semisup_50k +for f in data/train_sup/utt2spk data/train_unsup250k/utt2spk ]; do + if [ ! -f $f ]; then + echo "$0: Could not find $f" + exit 1 + fi +done + utils/subset_data_dir.sh --speakers data/train_sup 50000 data/train_sup50k || exit 1 utils/subset_data_dir.sh --shortest data/train_sup50k 25000 data/train_sup50k_short || exit 1 utils/subset_data_dir.sh --speakers data/train_sup50k 30000 data/train_sup50k_30k || exit 1; @@ -65,6 +72,27 @@ steps/train_sat.sh --cmd "$train_cmd" \ utils/combine_data.sh data/semisup50k_250k data/train_sup50k data/train_unsup250k || exit 1 +mkdir -p data/local/pocolm_ex250k + +utils/filter_scp.pl --exclude data/train_unsup250k/utt2spk \ + data/train/text > data/local/pocolm_ex250k/text.tmp + +if [ ! 
-f data/lang_test_poco_ex250k_big/G.carpa ]; then + local/fisher_train_lms_pocolm.sh \ + --text data/local/pocolm_ex250k/text.tmp \ + --dir data/local/pocolm_ex250k + + local/fisher_create_test_lang.sh \ + --arpa-lm data/local/pocolm_ex250k/data/arpa/4gram_small.arpa.gz \ + --dir data/lang_test_poco_ex250k + + utils/build_const_arpa_lm.sh \ + data/local/pocolm_ex250k/data/arpa/4gram_big.arpa.gz \ + data/lang_test_poco_ex250k data/lang_test_poco_ex250k_big +fi + +local/run_unk_model.sh --lang-dirs "data/lang_test_poco_ex250k_big data/lang_test_poco_ex250k" || exit 1 + local/semisup/chain/tuning/run_tdnn_50k.sh \ --train-set train_sup50k \ --nnet3-affix _semi50k_250k \ From 42e9065e827dc4e5441713c012facfeafd59131a Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 6 Dec 2017 13:35:57 -0500 Subject: [PATCH 105/174] semisup-smbr: Add more recipes with UNK model --- ...un_tdnn_100k_250k_semisupervised_conf_a.sh | 482 ++++++++++++++++++ ...un_tdnn_100k_250k_semisupervised_conf_b.sh | 482 ++++++++++++++++++ .../semisup/chain/tuning/run_tdnn_100k_d.sh | 199 ++++++++ .../semisup/chain/tuning/run_tdnn_100k_e.sh | 198 +++++++ .../semisup/chain/tuning/run_tdnn_100k_f.sh | 199 ++++++++ .../run_tdnn_15k_semisupervised_conf_ap.sh | 471 +++++++++++++++++ .../run_tdnn_15k_semisupervised_conf_aq.sh | 472 +++++++++++++++++ .../semisup/chain/tuning/run_tdnn_50k_d.sh | 2 +- .../run_tdnn_50k_semisupervised_conf_l.sh | 2 +- .../run_tdnn_50k_semisupervised_conf_m.sh | 2 +- .../run_tdnn_50k_semisupervised_conf_n.sh | 6 +- .../run_tdnn_50k_semisupervised_conf_o.sh | 466 +++++++++++++++++ .../run_tdnn_50k_semisupervised_conf_p.sh | 472 +++++++++++++++++ .../run_tdnn_50k_semisupervised_conf_q.sh | 468 +++++++++++++++++ 14 files changed, 3915 insertions(+), 6 deletions(-) create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_a.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_b.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_d.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_e.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_f.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ap.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aq.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_o.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_p.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_q.sh diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_a.sh new file mode 100644 index 00000000000..b1b29be6026 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_a.sh @@ -0,0 +1,482 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 3gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + 
+supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7f # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_f +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_b.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_b.sh new file mode 100644 index 00000000000..cd76a4f4e76 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_b.sh @@ -0,0 +1,482 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 3gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_d.sh new file mode 100755 index 
00000000000..4965e0d4dcb --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_d.sh @@ -0,0 +1,199 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 100 hours. +# This is similar to _b, but uses a bi-phone tree with 7000 leaves + +# configs for 'chain' +stage=0 +tdnn_affix=7d +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_d +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_e.sh new file mode 100755 index 00000000000..36f9107039b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_e.sh @@ -0,0 +1,198 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 100 hours. +# This is similar to _b, but uses a bi-phone tree with 7000 leaves + +# configs for 'chain' +stage=0 +tdnn_affix=7e +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_e +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
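+  # The --context-opts below ("--context-width=2 --central-position=1")
+  # request a left-biphone context, matching the "bi-phone tree with 7000
+  # leaves" mentioned at the top of this script; 7000 is the leaf count
+  # passed to build_tree.sh.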
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
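+
+  # A rough reading of the main chain options below (hedged; see the option
+  # help in steps/nnet3/chain/train.py for the authoritative descriptions):
+  # --chain.xent-regularize adds the auxiliary cross-entropy output as a
+  # regularizer, --chain.leaky-hmm-coefficient lets a small amount of
+  # probability "leak" between HMM states in the denominator computation,
+  # and --chain.l2-regularize puts an l2 penalty on the network outputs.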
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_f.sh new file mode 100755 index 00000000000..dca562e7c20 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_f.sh @@ -0,0 +1,199 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 100 hours. +# This is similar to _b, but uses a bi-phone tree with 7000 leaves + +# configs for 'chain' +stage=0 +tdnn_affix=7f +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_d +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
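+
+  # Note on the egs options below (hedged paraphrase of standard chain-recipe
+  # behaviour; exact handling is in steps/nnet3/chain/get_egs.sh):
+  # --egs.chunk-width 160,140,110,80 requests examples of several chunk
+  # lengths so utterances of different durations can be packed with little
+  # padding, and --trainer.frames-per-iter 1500000 is roughly the number of
+  # frames processed per training iteration.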
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ap.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ap.sh new file mode 100644 index 00000000000..8b94b46f3bb --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_ap.sh @@ -0,0 +1,471 @@ +#!/bin/bash + +# This script is same as _ao, but uses creates denominator FST using speed-perturbed data. 
+# sup_frames_per_eg=160,140,110,80 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7i # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=160,140,110,80 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1ap # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_i +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
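+
+    # The branch below picks between get_egs_split.sh ("smart" splitting) and
+    # the plain get_egs.sh, per the "Supervision: Smart split lattices" note
+    # in this script's header: the split variant forms the chunk-level
+    # supervision by splitting the decoded lattices directly.  (Hedged
+    # summary; see the two scripts for the exact behaviour.)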
+ + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + echo "$0: generating egs from the unsupervised data" + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aq.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aq.sh new file mode 100644 index 00000000000..7cd4f890d6a --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_15k_semisupervised_conf_aq.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# This script is same as _ao, but uses smaller weight on supervised phone alignments +# because they are from speed perturbed data. +# sup_frames_per_eg=160,140,110,80 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 5,6 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup15k_250k # for reference +exp=exp/semisup_15k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semi_affix=semi15k_250k # affix relating train-set splitting proportion + +tdnn_affix=7i # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_unphdet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config unless sup_frames_per_eg + # -- you will need to change minibatch_size for comb training accordingly +sup_frames_per_eg=160,140,110,80 +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1aq # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,6 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_i +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= +lang_test_suffix= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + [ ! -z "$sup_frames_per_eg" ] && frames_per_eg=$sup_frames_per_eg + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
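+
+    # --deriv-weights-scp below points at the per-frame weights produced by
+    # the best-path decoding; per this script's header ("Deriv weights:
+    # Lattice posterior of best path pdf"), each unsupervised frame's
+    # derivative is scaled by that posterior (when apply_deriv_weights=true),
+    # so low-confidence frames contribute less to the gradient.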
+ + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + echo "$0: generating egs from the unsupervised data" + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_d.sh index 
b2fb59e4fed..7c008d0d879 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_d.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_d.sh @@ -200,7 +200,7 @@ if [ $stage -le 15 ]; then steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ - $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + $graph_dir data/${decode_set}_hires $dir/decode_poco_unk_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; ) & done fi diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_l.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_l.sh index a4971398133..f93aebb4027 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_l.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_l.sh @@ -27,7 +27,7 @@ train_supervised_opts="--stage -10 --train-stage -10" # Unsupervised options decode_affix=_undet egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir -unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data tolerance=1 diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_m.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_m.sh index 08b78f675f2..f85a34660bd 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_m.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_m.sh @@ -27,7 +27,7 @@ train_supervised_opts="--stage -10 --train-stage -10" # Unsupervised options decode_affix=_undet egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir -unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data tolerance=1 diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_n.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_n.sh index aab0d7a2c6a..1b2614c660c 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_n.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_n.sh @@ -27,7 +27,7 @@ train_supervised_opts="--stage -10 --train-stage -10" # Unsupervised options decode_affix=_undet egs_affix= # affix for the egs that are generated from unsupervised data and for the comined 
egs dir -unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data tolerance=1 @@ -233,7 +233,7 @@ if [ $stage -le 11 ]; then # similar in the xent and regular final layers. relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - + output name=output-0 input=output.affine skip-in-init=true output name=output-1 input=output.affine skip-in-init=true @@ -411,7 +411,7 @@ fi if [ $stage -le 19 ]; then mkdir -p ${dir}${finetune_suffix} - + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 done diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_o.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_o.sh new file mode 100755 index 00000000000..82da071e0bc --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_o.sh @@ -0,0 +1,466 @@ +#!/bin/bash + +# This script is same as _k, but uses biphone tree. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating train-set splitting proportion + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1o # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 
--chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
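+
+    # A minimal optional sanity check (an illustrative sketch, not a required
+    # step of this recipe): the deriv-weights scp passed to get_egs below is
+    # assumed to cover the unsupervised utterances, which can be verified
+    # cheaply before the expensive egs generation, e.g.:
+    #   nutt=$(wc -l < data/${unsupervised_set}_hires/utt2spk)
+    #   nwts=$(wc -l < $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp)
+    #   [ "$nutt" -eq "$nwts" ] || echo "$0: warning: weights.scp has $nwts of $nutt utterances"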
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width `cat $comb_egs_dir/info/frames_per_eg` \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_p.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_p.sh new 
file mode 100755 index 00000000000..db989b65eb5 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_p.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# This script is same as _k, but uses biphone tree. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating train-set splitting proportion + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1p # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,6 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
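+
+    # Note (an illustrative aside, not a change to the recipe): the egs
+    # produced here are later merged with the supervised egs by
+    # steps/nnet3/multilingual/combine_egs.sh, where --lang2weight
+    # "$supervision_weights" assigns one scale per egs source; for example a
+    # hypothetical setting of supervision_weights=1.0,0.5 would down-weight
+    # the unsupervised examples relative to the supervised ones.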
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width `cat $comb_egs_dir/info/frames_per_eg` \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_q.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_q.sh new 
file mode 100755 index 00000000000..0dc1ef33e03 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_conf_q.sh @@ -0,0 +1,468 @@ +#!/bin/bash + +# This script is same as _p, but does not use phone UNK model +# Also the same as _n, but uses speed-perturbed data to get +# appropriate weights for phone LM. +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Smart split lattices + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup50k_250k # for reference +exp=exp/semisup_50k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semi_affix=semi50k_250k # affix relating train-set splitting proportion + +tdnn_affix=7e # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_undet +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1q # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +comb_egs_dir= +tree_affix=bi_e +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +train_extra_opts= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$comb_egs_dir" ] && [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + if [ -z "$comb_egs_dir" ]; then + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) + fi +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$comb_egs_dir" ] && [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
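+
+    # Optional check (a sketch only, relying on the egs dir containing an
+    # info/frames_per_eg file as read elsewhere in this script): once this
+    # stage finishes, confirm that examples were produced before moving on
+    # to the combination stage, e.g.:
+    #   [ -f $unsup_egs_dir/info/frames_per_eg ] || \
+    #     { echo "$0: no egs info found in $unsup_egs_dir"; exit 1; }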
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +if [ -z "$comb_egs_dir" ]; then + comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + + if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. + fi +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width `cat $comb_egs_dir/info/frames_per_eg` \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir ${train_extra_opts} || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + From df0913327878801b1b8e1c3301ec8becc5d82a9d Mon Sep 17 00:00:00 2001 From: Nagendra Kumar Goel Date: Tue, 9 Jan 2018 17:05:01 -0500 Subject: [PATCH 106/174] SWBD stats pooling VAD recipe --- 
.../s5c/local/run_cleanup_segmentation.sh | 56 ++++++++++++++ .../local/segmentation/copy_targets_dir.sh | 76 +++++++++++++++++++ 2 files changed, 132 insertions(+) create mode 100755 egs/swbd/s5c/local/run_cleanup_segmentation.sh create mode 100755 egs/swbd/s5c/local/segmentation/copy_targets_dir.sh diff --git a/egs/swbd/s5c/local/run_cleanup_segmentation.sh b/egs/swbd/s5c/local/run_cleanup_segmentation.sh new file mode 100755 index 00000000000..d08d3f0e0b4 --- /dev/null +++ b/egs/swbd/s5c/local/run_cleanup_segmentation.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script demonstrates how to re-segment training data selecting only the +# "good" audio that matches the transcripts. +# The basic idea is to decode with an existing in-domain acoustic model, and a +# biased language model built from the reference, and then work out the +# segmentation from a ctm like file. + +# For nnet3 and chain results after cleanup, see the scripts in +# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh + +# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets +# [will add these later]. + +set -e +set -o pipefail +set -u + +stage=0 +cleanup_stage=0 +data=data/train +cleanup_affix=cleaned +srcdir=exp/tri4_mmi_b0.1 +langdir=data/lang_sw1_tg +nj=100 +decode_nj=16 +decode_num_threads=4 + +. ./path.sh +. ./cmd.sh +. utils/parse_options.sh + +cleaned_data=${data}_${cleanup_affix} + +dir=${srcdir}_${cleanup_affix}_work +cleaned_dir=${srcdir}_${cleanup_affix} + +if [ $stage -le 1 ]; then + # This does the actual data cleanup. + steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage --nj $nj --cmd "$train_cmd" \ + $data $langdir $srcdir $dir $cleaned_data +fi + +if [ $stage -le 2 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $cleaned_data $langdir $srcdir ${srcdir}_ali_${cleanup_affix} +fi + +if [ $stage -le 3 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 $cleaned_data $langdir ${srcdir}_ali_${cleanup_affix} ${cleaned_dir} +fi diff --git a/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh b/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh new file mode 100755 index 00000000000..8be70b4715a --- /dev/null +++ b/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (author: Nagendra K Goel) +# Apache 2.0 + +# This script operates on a directory, such as in exp/segmentation_1a/train_whole_combined_targets_rev1, +# that contains some subset of the following files: +# targets.X.ark +# frame_subsampling_factor +# It copies to another directory, possibly adding a specified prefix or a suffix +# to the utterance names. + + +# begin configuration section +utt_prefix= +utt_suffix= +cmd=run.pl +# end configuration section + +. utils/parse_options.sh + +if [ $# != 2 ]; then + echo "Usage: " + echo " $0 [options] " + echo "e.g.:" + echo " $0 --utt-prefix=1- exp/segmentation_1a/train_whole_combined_targets_sub3 exp/segmentation_1a/train_whole_combined_targets_sub3_rev1" + echo "Options" + echo " --utt-prefix= # Prefix for utterance ids, default empty" + echo " --utt-suffix= # Suffix for utterance ids, default empty" + exit 1; +fi + + +export LC_ALL=C + +src_dir=$1 +dest_dir=$2 + +mkdir -p $dest_dir + +if [ ! 
-f $src_dir/targets.1.ark ]; then + echo "copy_targets_dir.sh: no such files $src_dir/targets.1.ark" + exit 1; +fi + +for f in frame_subsampling_factor; do + if [ ! -f $src_dir/$f ]; then + echo "$0: no such file $src_dir/$f this might be serious error." + continue + fi + cp $src_dir/$f $dest_dir/ +done + +nj=$(ls $src_dir/targets.*.ark | wc -l) +mkdir -p $dest_dir/temp +cat << EOF > $dest_dir/temp/copy_targets.sh +set -e; +id=\$1 +echo "$src_dir/targets.\$id.ark" +copy-matrix ark:$src_dir/targets.\$id.ark ark,t:- | \ +python -c " +import sys +for line in sys.stdin: + parts = line.split() + if \"[\" not in line: + print line.rstrip() + else: + print '$utt_prefix{0}$utt_suffix {1}'.format(parts[0], ' '.join(parts[1:])) +" | \ + copy-matrix ark,t:- ark:$dest_dir/targets.\$id.ark || exit 1; +set +o pipefail; # unset the pipefail option. +EOF +chmod +x $dest_dir/temp/copy_targets.sh +$cmd -v PATH JOB=1:$nj $dest_dir/temp/copy_targets.JOB.log $dest_dir/temp/copy_targets.sh JOB || exit 1; + +echo "$0: copied targets from $src_dir to $dest_dir" From b9c7161fbaf6b37907a47e3dfa510c28b5c4abdd Mon Sep 17 00:00:00 2001 From: Nagendra Kumar Goel Date: Tue, 9 Jan 2018 17:14:38 -0500 Subject: [PATCH 107/174] Add SWBD VAD recipe --- egs/swbd/s5c/local/run_asr_segmentation.sh | 83 ++++++++++++++----- .../s5c/local/run_cleanup_segmentation.sh | 3 +- .../local/segmentation/copy_targets_dir.sh | 3 +- .../tuning/train_lstm_asr_sad_1a.sh | 7 +- .../tuning/train_stats_asr_sad_1a.sh | 9 +- 5 files changed, 77 insertions(+), 28 deletions(-) diff --git a/egs/swbd/s5c/local/run_asr_segmentation.sh b/egs/swbd/s5c/local/run_asr_segmentation.sh index 32b2e3a8411..d87703d1e90 100755 --- a/egs/swbd/s5c/local/run_asr_segmentation.sh +++ b/egs/swbd/s5c/local/run_asr_segmentation.sh @@ -1,15 +1,17 @@ -#! /bin/bash +#!/bin/bash -# Copyright 2017 Vimal Manohar +# Copyright 2017 Nagendra Kumar Goel +# 2017 Vimal Manohar # Apache 2.0 -# Features configs (Must match the features used to train the models -# $sat_model_dir and $model_dir) +# We assume the run-1-main.sh (because we are using model directories like +# exp/tri4) and later we assumme run-4-anydecode.sh was run to prepare +# data/dev10h.pem -lang=data/lang_nosp # Must match the one used to train the models +lang=data/lang # Must match the one used to train the models lang_test=data/lang_nosp_sw1_tg # Lang directory for decoding. -data_dir=data/train_100k_nodup +data_dir=data/train # Model directory used to align the $data_dir to get target labels for training # SAD. This should typically be a speaker-adapted system. sat_model_dir=exp/tri4 @@ -40,8 +42,8 @@ affix=_1a stage=-1 nj=80 -. ./path.sh -. ./cmd.sh +. path.sh +. cmd.sh set -e -u -o pipefail . 
utils/parse_options.sh @@ -55,7 +57,7 @@ mkdir -p $dir # See $lang/phones.txt and decide which should be garbage garbage_phones="lau spn" -silence_phones="nsn SIL" +silence_phones="sil" for p in $garbage_phones; do for affix in "" "_B" "_E" "_I" "_S"; do @@ -85,8 +87,10 @@ fi # Extract features for the whole data directory ############################################################################### if [ $stage -le 1 ]; then - steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj --write-utt2num-frames true \ - ${whole_data_dir} || exit 1 + steps/make_mfcc.sh --nj 50 --cmd "$train_cmd" --write-utt2num-frames true \ + $whole_data_dir exp/make_mfcc/train_whole + steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/train_whole + utils/fix_data_dir.sh $whole_data_dir fi ############################################################################### @@ -112,18 +116,27 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - utils/copy_data_dir.sh ${whole_data_dir} ${whole_data_dir}_hires_bp - steps/make_mfcc.sh --mfcc-config conf/mfcc_hires_bp.conf --nj 40 \ - ${whole_data_dir}_hires_bp - steps/compute_cmvn_stats.sh ${whole_data_dir}_hires_bp + utils/copy_data_dir.sh ${whole_data_dir} ${whole_data_dir}_hires + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 \ + ${whole_data_dir}_hires + steps/compute_cmvn_stats.sh ${whole_data_dir}_hires fi +# if [ $stage -le 4.5 ]; then +# # Train a TDNN-LSTM network for SAD +# local/segmentation/tuning/train_lstm_asr_sad_1a.sh \ +# --stage $nstage --train-stage $train_stage \ +# --targets-dir $dir \ +# --data-dir ${whole_data_dir}_hires +# fi + if [ $stage -le 5 ]; then # Train a TDNN-LSTM network for SAD - local/segmentation/tuning/train_lstm_asr_sad_1a.sh \ + + local/segmentation/tuning/train_stats_asr_sad_1a.sh \ --stage $nstage --train-stage $train_stage \ --targets-dir $dir \ - --data-dir ${whole_data_dir}_hires_bp + --data-dir ${whole_data_dir}_hires fi if [ $stage -le 6 ]; then @@ -137,9 +150,37 @@ if [ $stage -le 6 ]; then steps/segmentation/detect_speech_activity.sh \ --extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 \ --extra-left-context-initial 0 --extra-right-context-final 0 \ - --nj 32 --acwt 0.3 --stage $test_stage \ + --nj 32 --acwt 0.3 --mfcc-config "conf/mfcc_hires.conf" --stage $test_stage \ data/eval2000 \ - exp/segmentation_1a/tdnn_lstm_asr_sad_1a \ - mfcc_hires_bp \ - exp/segmentation_1a/tdnn_lstm_asr_sad_1a/{,eval2000} + exp/segmentation_1a/tdnn_stats_asr_sad_1a2 \ + mfcc_hires \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/{,eval2000} +fi + +if [ $stage -le 7 ]; then + # Do some diagnostics + steps/segmentation/evaluate_segmentation.pl data/eval2000/segments \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/segments &> \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/evaluate_segmentation.log + + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/utt2spk \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/segments \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/sys.rttm + + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ + data/eval2000/utt2spk \ + data/eval2000/segments \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/ref.rttm + + export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin + md-eval.pl -c 0.25 -r exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/ref.rttm \ + -s exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/sys.rttm > \ + 
exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/md_eval.log +fi + +if [ $stage -le 8 ]; then + utils/copy_data_dir.sh exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg \ + data/eval2000.seg_asr_sad_1a fi + diff --git a/egs/swbd/s5c/local/run_cleanup_segmentation.sh b/egs/swbd/s5c/local/run_cleanup_segmentation.sh index d08d3f0e0b4..b286f10e0d3 100755 --- a/egs/swbd/s5c/local/run_cleanup_segmentation.sh +++ b/egs/swbd/s5c/local/run_cleanup_segmentation.sh @@ -1,6 +1,7 @@ #!/bin/bash -# Copyright 2016 Vimal Manohar +# 2017 Nagendra Kumar Goel +# 2016 Vimal Manohar # 2016 Johns Hopkins University (author: Daniel Povey) # Apache 2.0 diff --git a/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh b/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh index 8be70b4715a..81c9193d22e 100755 --- a/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh +++ b/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh @@ -1,6 +1,7 @@ #!/bin/bash -# Copyright 2014 Johns Hopkins University (author: Nagendra K Goel) +# Copyright 2017 Nagendra Kumar Goel +# 2014 Johns Hopkins University (author: Nagendra K Goel) # Apache 2.0 # This script operates on a directory, such as in exp/segmentation_1a/train_whole_combined_targets_rev1, diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh index 63f78aa8092..9ea3e895f95 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh @@ -1,12 +1,15 @@ #!/bin/bash +# Copyright 2017 Nagendra Kumar Goel +# Apache 2.0 + # This is a script to train a TDNN-LSTM for speech activity detection (SAD) # using LSTM for long-context information. set -o pipefail set -u -. ./cmd.sh +. cmd.sh # At this script level we don't support not running on GPU, as it would be painfully slow. # If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, @@ -47,7 +50,7 @@ affix=1a data_dir=exp/segmentation_1a/train_whole_hires_bp targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 -. ./cmd.sh +. cmd.sh . ./path.sh . ./utils/parse_options.sh diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh index 2dfe9a0bb96..b3a6b6948a3 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh @@ -1,12 +1,15 @@ #!/bin/bash +# Copyright 2017 Nagendra Kumar Goel +# Apache 2.0 + # This is a script to train a TDNN for speech activity detection (SAD) # using statistics pooling for long-context information. set -o pipefail set -u -. ./cmd.sh +. cmd.sh # At this script level we don't support not running on GPU, as it would be painfully slow. # If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, @@ -46,7 +49,7 @@ affix=1a2 data_dir=exp/segmentation_1a/train_whole_hires_bp targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 -. ./cmd.sh +. cmd.sh . ./path.sh . 
./utils/parse_options.sh @@ -132,7 +135,7 @@ if [ $stage -le 6 ]; then copy-feats scp:$targets_dir/targets.scp ark:- | \ matrix-sum-rows ark:- ark:- | vector-sum --binary=false ark:- - | \ - awk '{print " [ "$2" "$3" ]"}' > $dir/post_output.vec + awk '{print " [ "$2" "$3" "$4" ]"}' > $dir/post_output.vec echo 3 > $dir/frame_subsampling_factor fi From 36747c4273685b3e88a25b1bacd4e8d3fa2a079e Mon Sep 17 00:00:00 2001 From: Nagendra Kumar Goel Date: Thu, 11 Jan 2018 12:13:10 -0500 Subject: [PATCH 108/174] path.sh convention and comments update --- egs/swbd/s5c/local/run_asr_segmentation.sh | 9 ++++----- egs/swbd/s5c/local/run_cleanup_segmentation.sh | 2 +- .../local/segmentation/tuning/train_lstm_asr_sad_1a.sh | 3 +-- .../local/segmentation/tuning/train_stats_asr_sad_1a.sh | 5 ++--- egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh | 1 + 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/egs/swbd/s5c/local/run_asr_segmentation.sh b/egs/swbd/s5c/local/run_asr_segmentation.sh index d87703d1e90..d986a481f8c 100755 --- a/egs/swbd/s5c/local/run_asr_segmentation.sh +++ b/egs/swbd/s5c/local/run_asr_segmentation.sh @@ -4,9 +4,8 @@ # 2017 Vimal Manohar # Apache 2.0 -# We assume the run-1-main.sh (because we are using model directories like -# exp/tri4) and later we assumme run-4-anydecode.sh was run to prepare -# data/dev10h.pem +# We assume the run.sh has been executed (because we are using model +# directories like exp/tri4) lang=data/lang # Must match the one used to train the models lang_test=data/lang_nosp_sw1_tg # Lang directory for decoding. @@ -42,8 +41,8 @@ affix=_1a stage=-1 nj=80 -. path.sh -. cmd.sh +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi set -e -u -o pipefail . utils/parse_options.sh diff --git a/egs/swbd/s5c/local/run_cleanup_segmentation.sh b/egs/swbd/s5c/local/run_cleanup_segmentation.sh index b286f10e0d3..8b08422d277 100755 --- a/egs/swbd/s5c/local/run_cleanup_segmentation.sh +++ b/egs/swbd/s5c/local/run_cleanup_segmentation.sh @@ -31,8 +31,8 @@ nj=100 decode_nj=16 decode_num_threads=4 -. ./path.sh . ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi . utils/parse_options.sh cleaned_data=${data}_${cleanup_affix} diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh index 9ea3e895f95..e3baa67b606 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh @@ -50,8 +50,7 @@ affix=1a data_dir=exp/segmentation_1a/train_whole_hires_bp targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 -. cmd.sh -. ./path.sh +if [ -f ./path.sh ]; then . ./path.sh; fi . ./utils/parse_options.sh if [ -z "$dir" ]; then diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh index b3a6b6948a3..842f96ce1b9 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh @@ -9,7 +9,7 @@ set -o pipefail set -u -. cmd.sh +. ./cmd.sh # At this script level we don't support not running on GPU, as it would be painfully slow. # If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, @@ -49,8 +49,7 @@ affix=1a2 data_dir=exp/segmentation_1a/train_whole_hires_bp targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 -. cmd.sh -. ./path.sh +if [ -f ./path.sh ]; then . ./path.sh; fi . 
./utils/parse_options.sh if [ -z "$dir" ]; then diff --git a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh index f8557a70177..bc646986eea 100755 --- a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh +++ b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh @@ -1,6 +1,7 @@ #! /bin/bash # Copyright 2017 Vimal Manohar +# 2017 Nagendra Kumar Goel # Apache 2.0 # This script prepares targets for training neural network for From 6390477ce0ce8e21806886f3362e5625d7c37c8e Mon Sep 17 00:00:00 2001 From: Nagendra Kumar Goel Date: Fri, 12 Jan 2018 17:25:17 -0500 Subject: [PATCH 109/174] add options for noise and reverberations --- egs/swbd/s5c/local/run_asr_segmentation.sh | 101 +++++++++++++----- .../segmentation/combine_targets_dirs.sh | 83 ++++++++++++++ .../tuning/train_stats_asr_sad_1a.sh | 13 +-- .../steps/segmentation/prepare_targets_gmm.sh | 2 +- 4 files changed, 163 insertions(+), 36 deletions(-) create mode 100755 egs/swbd/s5c/local/segmentation/combine_targets_dirs.sh diff --git a/egs/swbd/s5c/local/run_asr_segmentation.sh b/egs/swbd/s5c/local/run_asr_segmentation.sh index d986a481f8c..4d3356dc7b0 100755 --- a/egs/swbd/s5c/local/run_asr_segmentation.sh +++ b/egs/swbd/s5c/local/run_asr_segmentation.sh @@ -36,7 +36,8 @@ prepare_targets_stage=-10 nstage=-10 train_stage=-10 test_stage=-10 - +num_data_reps=1 +base_rirs=simulated affix=_1a stage=-1 nj=80 @@ -77,6 +78,7 @@ if ! cat $dir/garbage_phones.txt $dir/silence_phones.txt | \ fi whole_data_dir=${data_dir}_whole +rvb_data_dir=${whole_data_dir}_rvb if [ $stage -le 0 ]; then utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir @@ -115,30 +117,76 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - utils/copy_data_dir.sh ${whole_data_dir} ${whole_data_dir}_hires + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + if [ ! 
-f rirs_noises.zip ]; then + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + rvb_opts=() + if [ "$base_rirs" == "simulated" ]; then + # This is the config for the system using simulated RIRs and point-source noises + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters RIRS_NOISES/pointsource_noises/noise_list) + else + # This is the config for the JHU ASpIRE submission system + rvb_opts+=(--rir-set-parameters "1.0, RIRS_NOISES/real_rirs_isotropic_noises/rir_list") + rvb_opts+=(--noise-set-parameters RIRS_NOISES/real_rirs_isotropic_noises/noise_list) + fi + + foreground_snrs="20:10:15:5:0" + background_snrs="20:10:15:5:0" + num_reps=1 + # corrupt the data to generate multi-condition data + # for data_dir in train dev test; do + python steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix "rev" \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 0.5 \ + --pointsource-noise-addition-probability 0.5 \ + --isotropic-noise-addition-probability 0.7 \ + --num-replications $num_reps \ + --max-noises-per-minute 4 \ + --source-sampling-rate 8000 \ + $whole_data_dir $rvb_data_dir + + for i in `seq 1 $num_data_reps`; do + local/segmentation/copy_targets_dir.sh --cmd "$decode_cmd" --utt-prefix "rev${i}_" exp/segmentation_1a/train_whole_combined_targets_sub3 exp/segmentation_1a/train_whole_combined_targets_sub3_temp_$i || exit 1; + rvb_dirs+=" exp/segmentation_1a/train_whole_combined_targets_sub3_temp_$i" + done + + local/segmentation/combine_targets_dirs.sh $rvb_data_dir exp/segmentation_1a/train_whole_combined_targets_sub3_rvb $rvb_dirs || exit 1; + cp exp/segmentation_1a/train_whole_combined_targets_sub3_rvb/targets.scp exp/segmentation_1a/ +fi + +if [ $stage -le 5 ]; then + utils/copy_data_dir.sh ${rvb_data_dir} ${rvb_data_dir}_hires steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 \ - ${whole_data_dir}_hires - steps/compute_cmvn_stats.sh ${whole_data_dir}_hires + ${rvb_data_dir}_hires + steps/compute_cmvn_stats.sh ${rvb_data_dir}_hires fi -# if [ $stage -le 4.5 ]; then +# if [ $stage -le 6 ]; then # # Train a TDNN-LSTM network for SAD # local/segmentation/tuning/train_lstm_asr_sad_1a.sh \ # --stage $nstage --train-stage $train_stage \ # --targets-dir $dir \ -# --data-dir ${whole_data_dir}_hires +# --data-dir ${rvb_data_dir}_hires # fi -if [ $stage -le 5 ]; then - # Train a TDNN-LSTM network for SAD +if [ $stage -le 6 ]; then + # Train a STATS-pooling network for SAD local/segmentation/tuning/train_stats_asr_sad_1a.sh \ --stage $nstage --train-stage $train_stage \ --targets-dir $dir \ - --data-dir ${whole_data_dir}_hires + --data-dir ${rvb_data_dir}_hires fi -if [ $stage -le 6 ]; then +if [ $stage -le 7 ]; then # The options to this script must match the options used in the # nnet training script. # e.g. 
extra-left-context is 70, because the model is an LSTM trained with a @@ -149,37 +197,32 @@ if [ $stage -le 6 ]; then steps/segmentation/detect_speech_activity.sh \ --extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 \ --extra-left-context-initial 0 --extra-right-context-final 0 \ - --nj 32 --acwt 0.3 --mfcc-config "conf/mfcc_hires.conf" --stage $test_stage \ + --nj 32 --acwt 0.3 --stage $test_stage \ data/eval2000 \ exp/segmentation_1a/tdnn_stats_asr_sad_1a2 \ mfcc_hires \ exp/segmentation_1a/tdnn_stats_asr_sad_1a2/{,eval2000} fi -if [ $stage -le 7 ]; then +if [ $stage -le 8 ]; then # Do some diagnostics - steps/segmentation/evaluate_segmentation.pl data/eval2000/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/segments &> \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/evaluate_segmentation.log + steps/segmentation/evalute_segmentation.pl data/dev10h.pem/segments \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/segments &> \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/evalutate_segmentation.log steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/utt2spk \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/sys.rttm + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/utt2spk \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/segments \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/sys.rttm - steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ - data/eval2000/utt2spk \ - data/eval2000/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/ref.rttm - export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin - md-eval.pl -c 0.25 -r exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/ref.rttm \ - -s exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/sys.rttm > \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/md_eval.log + md-eval.pl -c 0.25 -r $dev10h_rttm_file \ + -s exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/sys.rttm > \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/md_eval.log fi -if [ $stage -le 8 ]; then - utils/copy_data_dir.sh exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg \ - data/eval2000.seg_asr_sad_1a +if [ $stage -le 9 ]; then + utils/copy_data_dir.sh exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg \ + data/dev10h.seg_asr_sad_1a fi diff --git a/egs/swbd/s5c/local/segmentation/combine_targets_dirs.sh b/egs/swbd/s5c/local/segmentation/combine_targets_dirs.sh new file mode 100755 index 00000000000..48c4ce93db0 --- /dev/null +++ b/egs/swbd/s5c/local/segmentation/combine_targets_dirs.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# Copyright 2017 Nagendra Kumar Goel +# Apache 2.0. + +# This srcipt operates on targets directories, such as exp/segmentation_1a/train_whole_combined_targets_sub3 +# the output is a new targets dir which has targets from all the input targets dirs + +# Begin configuration section. +cmd=run.pl +extra_files= +num_jobs=4 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 [options] ..." 
+ echo "e.g.: $0 --num-jobs 32 data/train exp/targets_combined exp/targets_1 exp/targets_2" + echo "Options:" + echo " --extra-files # specify addtional files in 'src-targets-dir1' to copy" + echo " --num-jobs # number of jobs used to split the data directory." + echo " Note, files that don't appear in the first source dir will not be added even if they appear in later ones." + echo " Other than alignments, only files from the first src ali dir are copied." + exit 1; +fi + +data=$1; +shift; +dest=$1; +shift; +first_src=$1; + +mkdir -p $dest; +rm $dest/{targets.*.ark,frame_subsampling_factor} 2>/dev/null + +cp $first_src/frame_subsampling_factor $dest 2>/dev/null + +export LC_ALL=C + +for dir in $*; do + if [ ! -f $dir/targets.1.ark ]; then + echo "$0: check if targets (targets.*.ark) are present in $dir." + exit 1; + fi +done + +for dir in $*; do + for f in frame_subsampling_factor; do + diff $first_src/$f $dir/$f 1>/dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "$0: Cannot combine alignment directories with different $f files." + fi + done +done + +for f in frame_subsampling_factor $extra_files; do + if [ ! -f $first_src/$f ]; then + echo "combine_targets_dir.sh: no such file $first_src/$f" + exit 1; + fi + cp $first_src/$f $dest/ +done + +src_id=0 +temp_dir=$dest/temp +[ -d $temp_dir ] && rm -r $temp_dir; +mkdir -p $temp_dir +echo "$0: dumping targets in each source directory as single archive and index." +for dir in $*; do + src_id=$((src_id + 1)) + cur_num_jobs=$(ls $dir/targets.*.ark | wc -l) || exit 1; + tgts=$(for n in $(seq $cur_num_jobs); do echo -n "$dir/targets.$n.ark "; done) + $cmd $dir/log/copy_targets.log \ + copy-matrix "ark:cat $tgts|" \ + ark,scp:$temp_dir/targets.$src_id.ark,$temp_dir/targets.$src_id.scp || exit 1; +done +sort -m $temp_dir/targets.*.scp > $dest/targets.scp || exit 1; + + +echo "Combined targets and stored in $dest" +exit 0 diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh index 842f96ce1b9..feb88a53454 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh @@ -1,15 +1,15 @@ #!/bin/bash # Copyright 2017 Nagendra Kumar Goel +# 2016 Vimal Manohar # Apache 2.0 - # This is a script to train a TDNN for speech activity detection (SAD) # using statistics pooling for long-context information. set -o pipefail set -u -. ./cmd.sh +. cmd.sh # At this script level we don't support not running on GPU, as it would be painfully slow. # If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, @@ -30,11 +30,11 @@ extra_right_context=21 relu_dim=256 # training options -num_epochs=4 +num_epochs=2 initial_effective_lrate=0.0003 final_effective_lrate=0.00003 -num_jobs_initial=3 -num_jobs_final=8 +num_jobs_initial=1 +num_jobs_final=1 remove_egs=true max_param_change=0.2 # Small max-param change for small network @@ -49,6 +49,7 @@ affix=1a2 data_dir=exp/segmentation_1a/train_whole_hires_bp targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 +. cmd.sh if [ -f ./path.sh ]; then . ./path.sh; fi . 
./utils/parse_options.sh @@ -134,7 +135,7 @@ if [ $stage -le 6 ]; then copy-feats scp:$targets_dir/targets.scp ark:- | \ matrix-sum-rows ark:- ark:- | vector-sum --binary=false ark:- - | \ - awk '{print " [ "$2" "$3" "$4" ]"}' > $dir/post_output.vec + awk '{print " [ "$2" "$3" ]"}' > $dir/post_output.vec echo 3 > $dir/frame_subsampling_factor fi diff --git a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh index bc646986eea..de19cfc6772 100755 --- a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh +++ b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh @@ -211,7 +211,7 @@ if [ $stage -le 5 ]; then # the speech / silence decisions, not the exact word sequences. steps/decode.sh --cmd "$decode_cmd --mem 2G" --nj $nj \ --max-active 1000 --beam 10.0 \ - --decode-extra-opts "--word-determinize=false" --skip-scoring true \ + --skip-scoring true \ $graph_dir $uniform_seg_data_dir $decode_dir fi From b62c2a87ce2bbdbc65f48afd6f6675b97c68c7f7 Mon Sep 17 00:00:00 2001 From: Nagendra Kumar Goel Date: Tue, 16 Jan 2018 08:46:49 -0500 Subject: [PATCH 110/174] Fix bugs in evaluations part --- egs/swbd/s5c/local/run_asr_segmentation.sh | 30 +++++++++---------- .../tuning/train_lstm_asr_sad_1a.sh | 2 +- .../tuning/train_stats_asr_sad_1a.sh | 12 ++++---- .../segmentation/detect_speech_activity.sh | 12 ++++---- 4 files changed, 30 insertions(+), 26 deletions(-) diff --git a/egs/swbd/s5c/local/run_asr_segmentation.sh b/egs/swbd/s5c/local/run_asr_segmentation.sh index 4d3356dc7b0..7129e905480 100755 --- a/egs/swbd/s5c/local/run_asr_segmentation.sh +++ b/egs/swbd/s5c/local/run_asr_segmentation.sh @@ -36,7 +36,7 @@ prepare_targets_stage=-10 nstage=-10 train_stage=-10 test_stage=-10 -num_data_reps=1 +num_data_reps=2 base_rirs=simulated affix=_1a stage=-1 @@ -164,7 +164,7 @@ fi if [ $stage -le 5 ]; then utils/copy_data_dir.sh ${rvb_data_dir} ${rvb_data_dir}_hires - steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 \ + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 10 \ ${rvb_data_dir}_hires steps/compute_cmvn_stats.sh ${rvb_data_dir}_hires fi @@ -206,23 +206,23 @@ fi if [ $stage -le 8 ]; then # Do some diagnostics - steps/segmentation/evalute_segmentation.pl data/dev10h.pem/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/segments &> \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/evalutate_segmentation.log + steps/segmentation/evaluate_segmentation.pl data/eval2000/segments \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/segments &> \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/evalutate_segmentation.log steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/utt2spk \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/sys.rttm - - export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin - md-eval.pl -c 0.25 -r $dev10h_rttm_file \ - -s exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/sys.rttm > \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/md_eval.log + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/utt2spk \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/segments \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/sys.rttm + +# export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin +# md-eval.pl -c 0.25 -r $eval2000_rttm_file \ +# -s exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/sys.rttm > \ +# 
exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/md_eval.log fi if [ $stage -le 9 ]; then - utils/copy_data_dir.sh exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg \ - data/dev10h.seg_asr_sad_1a + utils/copy_data_dir.sh exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg \ + data/eval2000.seg_asr_sad_1a fi diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh index e3baa67b606..74697df099f 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh @@ -9,7 +9,7 @@ set -o pipefail set -u -. cmd.sh +. ./cmd.sh # At this script level we don't support not running on GPU, as it would be painfully slow. # If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh index feb88a53454..3254929306f 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh @@ -9,7 +9,7 @@ set -o pipefail set -u -. cmd.sh +. ./cmd.sh # At this script level we don't support not running on GPU, as it would be painfully slow. # If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, @@ -30,7 +30,7 @@ extra_right_context=21 relu_dim=256 # training options -num_epochs=2 +num_epochs=1 initial_effective_lrate=0.0003 final_effective_lrate=0.00003 num_jobs_initial=1 @@ -46,7 +46,7 @@ config_dir= dir= affix=1a2 -data_dir=exp/segmentation_1a/train_whole_hires_bp +data_dir=exp/segmentation_1a/train_whole_rvb_hires targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 . cmd.sh @@ -132,10 +132,12 @@ if [ $stage -le 6 ]; then --targets-scp="$targets_dir/targets.scp" \ --egs.opts="--frame-subsampling-factor 3 --num-utts-subset $num_utts_subset" \ --dir=$dir || exit 1 +fi - copy-feats scp:$targets_dir/targets.scp ark:- | \ +if [ $stage -le 7 ]; then + copy-feats scp:$targets_dir/targets.scp ark:- | \ matrix-sum-rows ark:- ark:- | vector-sum --binary=false ark:- - | \ - awk '{print " [ "$2" "$3" ]"}' > $dir/post_output.vec + awk '{print " [ "$2" "$3" "$4" ]"}' > $dir/post_output.vec echo 3 > $dir/frame_subsampling_factor fi diff --git a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh index 69f47c28d60..9bc8eea675c 100755 --- a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh +++ b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh @@ -1,6 +1,7 @@ #!/bin/bash # Copyright 2016-17 Vimal Manohar +# 2017 Nagendra Kumar Goel # Apache 2.0. # This script does nnet3-based speech activity detection given an input @@ -12,16 +13,17 @@ set -e set -o pipefail set -u -. ./path.sh +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi affix= # Affix for the segmentation nj=32 -cmd=queue.pl +cmd=$decode_cmd stage=-1 # Feature options (Must match training) -mfcc_config=conf/mfcc_hires_bp.conf -feat_affix=bp # Affix for the type of feature used +mfcc_config=conf/mfcc_hires.conf +feat_affix=hires # Affix for the type of feature used convert_data_dir_to_whole=true # If true, the input data directory is # first converted to whole data directory (i.e. whole recordings) @@ -67,7 +69,7 @@ if [ $# -ne 5 ]; then echo "See script for details of the options to be supplied." 
echo "Usage: $0 " echo " e.g.: $0 ~/workspace/egs/ami/s5b/data/sdm1/dev exp/nnet3_sad_snr/nnet_tdnn_j_n4 \\" - echo " mfcc_hires_bp exp/segmentation_sad_snr/nnet_tdnn_j_n4 data/ami_sdm1_dev" + echo " mfcc_hires exp/segmentation_sad_snr/nnet_tdnn_j_n4 data/ami_sdm1_dev" echo "" echo "Options: " echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." From 0e100189b4a8749b1d7a3bd82090271b08511638 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 22 Jan 2018 11:38:28 -0500 Subject: [PATCH 111/174] semisup-smbr: add one-silence-class and exclude-silence options for chain-smbr --- src/chain/chain-denominator-smbr.cc | 12 ++- src/chain/chain-denominator.cc | 8 +- src/chain/chain-supervision-test.cc | 6 +- src/chain/chain-training.cc | 21 +++++- src/chain/chain-training.h | 12 +++ src/cudamatrix/cu-kernels-ansi.h | 6 ++ src/cudamatrix/cu-kernels.cu | 33 +++++++++ src/cudamatrix/cu-kernels.h | 10 +++ src/cudamatrix/cu-matrix.cc | 28 +++++++ src/cudamatrix/cu-matrix.h | 9 ++- src/nnet3/nnet-chain-diagnostics.cc | 85 +++++++++++++++------ src/nnet3/nnet-chain-diagnostics.h | 11 ++- src/nnet3/nnet-chain-training.cc | 110 +++++++++++++++++++--------- src/nnet3/nnet-training.cc | 31 +++++++- src/nnet3/nnet-training.h | 20 ++++- 15 files changed, 331 insertions(+), 71 deletions(-) diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc index e365599eb4b..ab865688c91 100644 --- a/src/chain/chain-denominator-smbr.cc +++ b/src/chain/chain-denominator-smbr.cc @@ -541,7 +541,7 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { BaseFloat alpha_beta_product = VecVec(this_alpha_dash, this_beta_dash), this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); - if (GetVerboseLevel() > 1 || !ApproxEqual(alpha_beta_product, num_sequences_)) { + if (!ApproxEqual(alpha_beta_product, num_sequences_)) { KALDI_WARN << "On time " << t << ", alpha-beta product " << alpha_beta_product << " != " << num_sequences_ << " alpha-dash-sum = " << this_alpha_dash.Sum() @@ -550,6 +550,11 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { KALDI_WARN << "Excessive error detected, will abandon this minibatch"; ok_ = false; } + } else { + KALDI_VLOG(1) << "On time " << t << ", alpha-beta product = " + << alpha_beta_product + << ", alpha-dash-sum = " << this_alpha_dash.Sum() + << ", beta-dash-sum = " << this_beta_dash.Sum(); } // alpha_smbr_vec is a vector of size 'num_hmm_states' * 'num_sequences_' @@ -565,9 +570,12 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { / alpha_beta_product * num_sequences_, tot_smbr_sum = tot_smbr_.Sum(); KALDI_ASSERT (alpha_beta_smbr_sum - alpha_beta_smbr_sum == 0.0); - if (GetVerboseLevel() > 1 || !ApproxEqual(tot_smbr_sum, alpha_beta_smbr_sum, 0.01)) { + if (!ApproxEqual(tot_smbr_sum, alpha_beta_smbr_sum, 0.01)) { KALDI_WARN << "On time " << t << ", alpha-beta-smbr " << alpha_beta_smbr_sum << " != " << tot_smbr_sum; + } else { + KALDI_VLOG(1) << "On time " << t << ", alpha-beta-smbr " + << alpha_beta_smbr_sum << " = tot-smbr-sum"; } //// use higher tolerance, since we are using randomized pruning for the diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index 3a9c350bbfe..781bc4e64ee 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -401,7 +401,7 @@ void DenominatorComputation::BetaGeneralFrameDebug(int32 t) { BaseFloat alpha_beta_product = VecVec(this_alpha_dash, this_beta_dash), this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); - if 
(GetVerboseLevel() > 1 || !ApproxEqual(alpha_beta_product, num_sequences_)) { + if (!ApproxEqual(alpha_beta_product, num_sequences_)) { KALDI_WARN << "On time " << t << ", alpha-beta product " << alpha_beta_product << " != " << num_sequences_ << " alpha-dash-sum = " << this_alpha_dash.Sum() @@ -410,6 +410,12 @@ void DenominatorComputation::BetaGeneralFrameDebug(int32 t) { KALDI_WARN << "Excessive error detected, will abandon this minibatch"; ok_ = false; } + } else { + KALDI_VLOG(1) << "On time " << t << ", alpha-beta product = " + << alpha_beta_product + << ", alpha-dash-sum = " << this_alpha_dash.Sum() + << ", beta-dash-sum = " << this_beta_dash.Sum(); + } // use higher tolerance, since we are using randomized pruning for the // log-prob derivatives. diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index 27c9ccaf438..ca9ac8db604 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -813,9 +813,9 @@ int main() { else CuDevice::Instantiate().SelectGpuId("yes"); #endif - for (int32 i = 0; i < 3; i++) { - kaldi::chain::ChainSupervisionTest(); - kaldi::chain::ChainSupervisionSimpleTest(); + for (int32 i = 0; i < 6; i++) { + if (i % 2 == 0) kaldi::chain::ChainSupervisionTest(); + else kaldi::chain::ChainSupervisionSimpleTest(); kaldi::chain::BreadthFirstTest(); } kaldi::chain::TestRanges(); diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 344e0613f3f..84e9c75fe00 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -148,13 +148,30 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, } } - if (sil_indices) + if (sil_indices && opts.exclude_silence) { + // Exclude numerator posteriors for silence pdfs from accuracy + // computation. This is done by setting silence pdf posteiors to zero. + // sil_indices is expected to have -1 at the indexes corresponding to + // silence pdfs, and "i" for any other index "i". num_posteriors.CopyCols(num_posteriors, *sil_indices); + } else if (sil_indices && opts.one_silence_class) { + // Create a copy with only the silence pdf posteriors. + CuMatrix silence_post(nnet_output.NumRows(), + nnet_output.NumCols()); + silence_post.CopyCols(num_posteriors, *sil_indices); + + // Sum the posteriors of silence pdfs to get posterior of silence class. + CuVector total_silence_post(nnet_output.NumRows()); + total_silence_post.AddColSumMat(1.0, silence_post, 0.0); + + // Copy the silence class posterior to the columns of the silence pdfs. 
+ num_posteriors.CopyColsFromVec(total_silence_post, *sil_indices); + } DenominatorSmbrComputation denominator(opts, den_graph, supervision.num_sequences, nnet_output, num_posteriors); - + BaseFloat den_logprob_negated; BaseFloat smbr_objf = denominator.ForwardSmbr(&den_logprob_negated); diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 221d0ed78f5..46ddbfa2228 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -62,6 +62,8 @@ struct ChainTrainingOptions { BaseFloat xent_regularize; bool use_smbr_objective; + bool exclude_silence; + bool one_silence_class; std::string silence_pdfs_str; @@ -70,6 +72,7 @@ struct ChainTrainingOptions { ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), xent_regularize(0.0), use_smbr_objective(false), + exclude_silence(false), one_silence_class(false), mmi_factor(0.0), smbr_factor(1.0) { } void Register(OptionsItf *opts) { @@ -98,6 +101,15 @@ struct ChainTrainingOptions { opts->Register("smbr-factor", &smbr_factor, "When using smbr objective, interpolate smbr objective " "with this weight"); + opts->Register("exclude-silence", &exclude_silence, + "Exclude numerator posteriors " + "of silence pdfs from accuracy computation in " + "sMBR training. --silence-pdfs is required if " + "this option is true."); + opts->Register("one-silence-class", &one_silence_class, + "Treat all silence pdfs as a single class for accuracy " + "computation in smBR training. --silence-pdfs is required " + "if this options is true."); } }; diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index f8d56972a3e..b26b88b0d9a 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -219,6 +219,12 @@ void cudaD_copy_cols_from_vec(dim3 Gr, dim3 Bl, double *mat_out, MatrixDim d_out, const double *v_in); void cudaF_copy_cols_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, const float *v_in); +void cudaD_copy_cols_at_indices_from_vec(dim3 Gr, dim3 Bl, double *mat_out, + const double *v_in, const MatrixIndexT_cuda* indices, + MatrixDim d_out); +void cudaF_copy_cols_at_indices_from_vec(dim3 Gr, dim3 Bl, float *mat_out, + const float *v_in, const MatrixIndexT_cuda* indices, + MatrixDim d_out); void cudaD_copy(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in); void cudaF_copy(dim3 Gr, dim3 Bl, float *y, const float *x, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index b4b15798a4a..14a301070af 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -795,6 +795,25 @@ static void _copy_cols_from_vec(Real* m_out, MatrixDim d, const Real* v_in) { } } +// This kernel writes a copy of the vector "v_in" to each col i of the matrix +// "m_out", where indices[i] != -1. If indices[i] == -1, then that column is +// left as is. +// the dimension of v_in should be equal to the #row of m_out. +// the dimension of indices should be equal to the #col of m_out. 
+template +__global__ +static void _copy_cols_at_indices_from_vec(Real* m_out, const Real* v_in, + const MatrixIndexT_cuda* indices, + MatrixDim d) { + int i = blockIdx.y * blockDim.y + threadIdx.y; // row id + int j = blockIdx.x * blockDim.x + threadIdx.x; // col id + if (i < d.rows && j < d.cols) { + if (indices[j] != -1) { + m_out[i * d.stride + j] = v_in[i]; + } + } +} + // _trace_mat_mat reduce the partial sum to // value[blockIdx.y * gridDim.x + blockIdx.x] // It use shared mem to transpose matrix B to ensure coalesced memory access @@ -4884,6 +4903,20 @@ void cudaF_copy_cols_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, _copy_cols_from_vec<<>>(mat_out, d_out, v_in); } +void cudaD_copy_cols_at_indices_from_vec(dim3 Gr, dim3 Bl, double *mat_out, + const double *v_in, + const MatrixIndexT_cuda* indices, + MatrixDim d_out) { + _copy_cols_at_indices_from_vec<<>>(mat_out, v_in, indices, d_out); +} + +void cudaF_copy_cols_at_indices_from_vec(dim3 Gr, dim3 Bl, float *mat_out, + const float *v_in, + const MatrixIndexT_cuda* indices, + MatrixDim d_out) { + _copy_cols_at_indices_from_vec<<>>(mat_out, v_in, indices, d_out); +} + void cudaF_diff_normalize_per_row(size_t Gr, size_t Bl, float *id, int id_stride, const float *iv, MatrixDim iv_dim, const float* od, diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 96319a37652..58210e84aba 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -424,6 +424,16 @@ inline void cuda_copy_cols_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, const float *v_in) { cudaF_copy_cols_from_vec(Gr, Bl, mat_out, d_out, v_in); } +inline void cuda_copy_cols_at_indices_from_vec(dim3 Gr, dim3 Bl, double *mat_out, + const double *v_in, const MatrixIndexT_cuda* indices, + MatrixDim d_out) { + cudaD_copy_cols_at_indices_from_vec(Gr, Bl, mat_out, v_in, indices, d_out); +} +inline void cuda_copy_cols_at_indices_from_vec(dim3 Gr, dim3 Bl, float *mat_out, + const float *v_in, const MatrixIndexT_cuda* indices, + MatrixDim d_out) { + cudaF_copy_cols_at_indices_from_vec(Gr, Bl, mat_out, v_in, indices, d_out); +} inline void cuda_copy(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 2ed93fb9ac7..3ae3a444f4f 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -2245,6 +2245,34 @@ void CuMatrixBase::CopyColsFromVec(const CuVectorBase &rv) { } } +template +void CuMatrixBase::CopyColsFromVec(const CuVectorBase &v, + const CuArray &indices) { + KALDI_ASSERT(indices.Dim() == NumCols()); + KALDI_ASSERT(NumRows() == v.Dim()); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuTimer tim; + // use 2D block (8x32) and large enough grid to cover matrix *this + // dimBlock.x need to be at least warpSize for coalesced memory access. 
+ const int32 warpSize = 32; + dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); + dim3 dimGrid(n_blocks(num_cols_, dimBlock.x), + n_blocks(num_rows_, dimBlock.y)); + cuda_copy_cols_at_indices_from_vec(dimGrid, dimBlock, Data(), v.Data(), + indices.Data(), Dim()); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim); + } else +#endif + { + for (MatrixIndexT j = 0; j < NumCols(); j++) { + if (indices.Data()[j] != -1) + Mat().CopyColFromVec(v.Vec(), j); + } + } +} + template void CuMatrixBase::CopyColFromVec(const CuVectorBase &v, diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index c841a2f380a..62d7df47060 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -250,10 +250,17 @@ class CuMatrixBase { void CopyRowsFromVec(const VectorBase &v); /// Copies vector into matrix, column-by-column. - /// Note that rv.Dim() must either equal NumRows()*NumCols() or NumRows(); + /// Note that v.Dim() must either equal NumRows()*NumCols() or NumRows(); /// this has two modes of operation. void CopyColsFromVec(const CuVectorBase &v); + /// Copies vector into column i of matrix if indices[i] != -1, else keep + /// column i as is. + /// indices.size() must equal this->NumCols(), + /// and v.Dim() must equal this.NumRows() + void CopyColsFromVec(const CuVectorBase &v, + const CuArray &indices); + /// Copy vector into specific column of matrix. void CopyColFromVec(const CuVectorBase &v, const MatrixIndexT col); diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 01246e26499..be83581044c 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -45,24 +45,42 @@ NnetChainComputeProb::NnetChainComputeProb( << "compute_deriv == false, use the other constructor."; } - if (!chain_config.silence_pdfs_str.empty()) { + if (chain_config.use_smbr_objective && + (chain_config.exclude_silence || chain_config.one_silence_class)) { + if (chain_config.silence_pdfs_str.empty()) { + KALDI_ERR << "--silence-pdfs is required if --exclude-silence or " + << "--one-silence-class is true."; + } + std::vector silence_pdfs; SplitStringToVector(chain_config.silence_pdfs_str, ":,", false, &silence_pdfs); int32 num_pdfs = nnet.OutputDim("output"); - std::vector indices(num_pdfs); - for (size_t i = 0; i < num_pdfs; i++) { - indices[i] = i; - } - - for (std::vector::iterator it = silence_pdfs.begin(); - it != silence_pdfs.end(); ++it) { - int32 pdf = std::atoi(it->c_str()); - if (pdf > num_pdfs) - KALDI_ERR << "Invalid pdf " << pdf << " in silence-pdfs " - << chain_config.silence_pdfs_str; - indices[pdf] = -1; + std::vector indices(num_pdfs, -1); + + if (chain_config.exclude_silence) { + for (size_t i = 0; i < num_pdfs; i++) { + indices[i] = i; + } + + for (std::vector::iterator it = silence_pdfs.begin(); + it != silence_pdfs.end(); ++it) { + int32 pdf = std::atoi(it->c_str()); + if (pdf > num_pdfs) + KALDI_ERR << "Invalid pdf " << pdf << " in silence-pdfs " + << chain_config.silence_pdfs_str; + indices[pdf] = -1; + } + } else { + for (std::vector::iterator it = silence_pdfs.begin(); + it != silence_pdfs.end(); ++it) { + int32 pdf = std::atoi(it->c_str()); + if (pdf > num_pdfs) + KALDI_ERR << "Invalid pdf " << pdf << " in silence-pdfs " + << chain_config.silence_pdfs_str; + indices[pdf] = pdf; + } } sil_indices_.Resize(num_pdfs); @@ -217,13 +235,17 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, &tot_like, &tot_l2_term, &tot_weight, (nnet_config_.compute_deriv ? 
&nnet_output_deriv : NULL), (use_xent ? &xent_deriv : NULL)); - + + BaseFloat objf_scale = 1.0; { unordered_map::iterator it = objective_scales_.find(sup.name); if (it != objective_scales_.end()) { + objf_scale = it->second; tot_like *= it->second; + tot_l2_term *= it->second; + tot_mmi_objf *= it->second; tot_weight *= it->second; if (nnet_config_.compute_deriv) nnet_output_deriv.Scale(it->second); @@ -242,11 +264,27 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, aux_objfs.push_back(tot_l2_term); if (chain_config_.use_smbr_objective) aux_objfs.push_back(tot_mmi_objf); - - ChainObjectiveInfo &totals = objf_info_[sup.name]; - totals.tot_weight += tot_weight; - totals.tot_like += tot_like; - totals.tot_aux_objfs.Add(aux_objfs); + + { + unordered_map::iterator it + = objf_info_.find(sup.name); + + if (it == objf_info_.end()) { + BaseFloat this_objf_scale = objf_scale; + std::vector aux_objf_scales(1, objf_scale); // for l2 term + if (chain_config_.use_smbr_objective) { + this_objf_scale *= chain_config_.smbr_factor; + aux_objf_scales.push_back(objf_scale * chain_config_.mmi_factor); + } + + ChainObjectiveInfo totals(this_objf_scale, aux_objf_scales); + it = objf_info_.insert(it, std::make_pair(sup.name, totals)); + } + + it->second.tot_weight += tot_weight; + it->second.tot_like += tot_like; + it->second.tot_aux_objfs.Add(aux_objfs); + } if (nnet_config_.compute_deriv) computer->AcceptInput(sup.name, &nnet_output_deriv); @@ -289,9 +327,14 @@ bool NnetChainComputeProb::PrintTotalStats() const { BaseFloat like = (info.tot_like / info.tot_weight); ObjectiveValues aux_objfs(info.tot_aux_objfs); - aux_objfs.Scale(1.0 / info.tot_weight); + aux_objfs.InvScale(info.tot_weight); BaseFloat tot_objf = like + aux_objfs.Sum(); - + + // Remove scales for the purpose of printing + if (info.objf_scale != 0.0) like /= info.objf_scale; + if (info.aux_objf_scales.size() > 0) + aux_objfs.InvScale(info.aux_objf_scales); + if (info.tot_aux_objfs.IsZero()) { KALDI_LOG << "Overall log-probability for '" << name << "' is " diff --git a/src/nnet3/nnet-chain-diagnostics.h b/src/nnet3/nnet-chain-diagnostics.h index 65456e5ec8d..ec45a9ad43d 100644 --- a/src/nnet3/nnet-chain-diagnostics.h +++ b/src/nnet3/nnet-chain-diagnostics.h @@ -36,9 +36,18 @@ namespace nnet3 { struct ChainObjectiveInfo { double tot_weight; double tot_like; + BaseFloat objf_scale; + std::vector aux_objf_scales; + ObjectiveValues tot_aux_objfs; ChainObjectiveInfo(): tot_weight(0.0), - tot_like(0.0) { } + tot_like(0.0), + objf_scale(1.0) { } + + ChainObjectiveInfo(BaseFloat objf_scale, + const std::vector &aux_objf_scales): + tot_weight(0.0), tot_like(0.0), + objf_scale(objf_scale), aux_objf_scales(aux_objf_scales) { } }; diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 70113b14aaa..77b1b74096b 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -56,30 +56,48 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, } } - if (!opts.chain_config.silence_pdfs_str.empty()) { + if (opts.chain_config.use_smbr_objective && + (opts.chain_config.exclude_silence || opts.chain_config.one_silence_class)) { + if (opts.chain_config.silence_pdfs_str.empty()) { + KALDI_ERR << "--silence-pdfs is required if --exclude-silence or " + << "--one-silence-class is true."; + } + std::vector silence_pdfs; SplitStringToVector(opts.chain_config.silence_pdfs_str, ":,", false, &silence_pdfs); int32 num_pdfs = nnet->OutputDim("output"); - std::vector 
indices(num_pdfs); - for (size_t i = 0; i < num_pdfs; i++) { - indices[i] = i; - } - - for (std::vector::iterator it = silence_pdfs.begin(); - it != silence_pdfs.end(); ++it) { - int32 pdf = std::atoi(it->c_str()); - if (pdf > num_pdfs) - KALDI_ERR << "Invalid pdf " << pdf << " in silence-pdfs " - << opts.chain_config.silence_pdfs_str; - indices[pdf] = -1; + std::vector indices(num_pdfs, -1); + + if (opts.chain_config.exclude_silence) { + for (size_t i = 0; i < num_pdfs; i++) { + indices[i] = i; + } + + for (std::vector::iterator it = silence_pdfs.begin(); + it != silence_pdfs.end(); ++it) { + int32 pdf = std::atoi(it->c_str()); + if (pdf > num_pdfs) + KALDI_ERR << "Invalid pdf " << pdf << " in silence-pdfs " + << opts.chain_config.silence_pdfs_str; + indices[pdf] = -1; + } + } else { + for (std::vector::iterator it = silence_pdfs.begin(); + it != silence_pdfs.end(); ++it) { + int32 pdf = std::atoi(it->c_str()); + if (pdf > num_pdfs) + KALDI_ERR << "Invalid pdf " << pdf << " in silence-pdfs " + << opts.chain_config.silence_pdfs_str; + indices[pdf] = pdf; + } } sil_indices_.Resize(num_pdfs); sil_indices_.CopyFromVec(indices); } - + if (!opts.nnet_config.objective_scales_str.empty()) { std::vector objectives_for_outputs; SplitStringToVector(opts.nnet_config.objective_scales_str, ",", false, @@ -237,12 +255,16 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, (use_xent ? &xent_deriv : NULL)); } + BaseFloat objf_scale = 1.0; { unordered_map::iterator it = objective_scales_.find(sup.name); if (it != objective_scales_.end()) { + objf_scale = it->second; tot_objf *= it->second; + tot_l2_term *= it->second; + tot_mmi_objf *= it->second; tot_weight *= it->second; nnet_output_deriv.Scale(it->second); } @@ -256,12 +278,14 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, // computation. 
note, xent_objf has a factor of '.supervision.weight' BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans); - unordered_map::iterator it = - objective_scales_.find(xent_name); - - if (it != objective_scales_.end()) { - xent_objf *= it->second; - xent_deriv.Scale(it->second); + { + unordered_map::iterator it = + objective_scales_.find(xent_name); + + if (it != objective_scales_.end()) { + xent_objf *= it->second; + xent_deriv.Scale(it->second); + } } objf_info_[xent_name + suffix].UpdateStats(xent_name + suffix, @@ -277,29 +301,45 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, xent_deriv.MulRowsVec(cu_deriv_weights); } - if (opts_.accumulate_avg_deriv && - objf_info_[sup.name + suffix].deriv_sum.Dim() == 0) - objf_info_[sup.name + suffix].deriv_sum.Resize(nnet_output.NumCols()); - - if (objf_info_[sup.name + suffix].deriv_sum.Dim() > 0) - objf_info_[sup.name + suffix].deriv_sum.AddRowSumMat( - 1.0, nnet_output_deriv, 1.0); - - computer->AcceptInput(sup.name, &nnet_output_deriv); - std::vector objective_values; objective_values.push_back(tot_l2_term); if (opts_.chain_config.use_smbr_objective) objective_values.push_back(tot_mmi_objf); - objf_info_[sup.name + suffix].UpdateStats(sup.name + suffix, - opts_.nnet_config.print_interval, - num_minibatches_processed_, - tot_weight, tot_objf, objective_values); + { + unordered_map::iterator it + = objf_info_.find(sup.name + suffix); + + if (it == objf_info_.end()) { + BaseFloat this_objf_scale = objf_scale; + std::vector aux_objf_scales(1, objf_scale); // l2_term + if (opts_.chain_config.use_smbr_objective) { + this_objf_scale *= opts_.chain_config.smbr_factor; + aux_objf_scales.push_back(objf_scale * opts_.chain_config.mmi_factor); + } + + ObjectiveFunctionInfo totals(objf_scale, aux_objf_scales); + it = objf_info_.insert(it, std::make_pair(sup.name + suffix, totals)); + } + + if (opts_.accumulate_avg_deriv && + it->second.deriv_sum.Dim() == 0) + it->second.deriv_sum.Resize(nnet_output.NumCols()); + + if (it->second.deriv_sum.Dim() > 0) + it->second.deriv_sum.AddRowSumMat(1.0, nnet_output_deriv, 1.0); + + it->second.UpdateStats(sup.name + suffix, + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, tot_objf, objective_values); + } + + computer->AcceptInput(sup.name, &nnet_output_deriv); if (use_xent) { xent_deriv.Scale(opts_.chain_config.xent_regularize); - if (opts_.accumulate_avg_deriv && + if (opts_.accumulate_avg_deriv && objf_info_[xent_name + suffix].deriv_sum.Dim() == 0) objf_info_[xent_name + suffix].deriv_sum.Resize(nnet_output.NumCols()); if (objf_info_[xent_name + suffix].deriv_sum.Dim() > 0) diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 5e1e01b1106..a35b24077d2 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -242,8 +242,27 @@ void ObjectiveValues::Add(const ObjectiveValues &other) { void ObjectiveValues::Scale(BaseFloat scale) { for (std::vector::iterator it = objective_values.begin(); - it != objective_values.end(); ++it) { + it != objective_values.end(); ++it) *it *= scale; +} + +void ObjectiveValues::InvScale(BaseFloat inv_scale) { + for (std::vector::iterator it = objective_values.begin(); + it != objective_values.end(); ++it) { + if (inv_scale != 0.0) + *it /= inv_scale; + else + KALDI_ASSERT(*it == 0.0); + } +} + +void ObjectiveValues::InvScale(const std::vector &inv_scales) { + KALDI_ASSERT(objective_values.size() == inv_scales.size()); + for (size_t i = 0; i < objective_values.size(); i++) { + if (inv_scales[i] 
!= 0.0) + objective_values[i] /= inv_scales[i]; + else + KALDI_ASSERT(objective_values[i] == 0.0); } } @@ -348,15 +367,20 @@ bool ObjectiveFunctionInfo::PrintTotalStats(const std::string &name) const { ObjectiveValues aux_objfs(tot_aux_objfs); aux_objfs.Scale(1.0 / tot_weight); BaseFloat sum_objf = objf + aux_objfs.Sum(); + + // Remove scales for the purpose of printing + if (objf_scale != 0.0) objf /= objf_scale; + aux_objfs.InvScale(aux_objf_scales); + if (tot_aux_objfs.IsZero()) { KALDI_LOG << "Overall average objective function for '" << name << "' is " - << (tot_objf / tot_weight) << " over " << tot_weight << " frames."; + << objf << " over " << tot_weight << " frames."; } else { KALDI_LOG << "Overall average objective function for '" << name << "' is " << objf << " + " << aux_objfs.Str() << " = " << sum_objf << " over " << tot_weight << " frames."; } - + if (deriv_sum.Dim() > 0) { Vector deriv_avg(deriv_sum); deriv_avg.Scale(1.0 / tot_weight); @@ -366,6 +390,7 @@ bool ObjectiveFunctionInfo::PrintTotalStats(const std::string &name) const { KALDI_LOG << "[this line is to be parsed by a script:] " << "log-prob-per-frame=" << objf; + return (tot_weight != 0.0); } diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h index 02ce0ff550a..a54d43cf251 100644 --- a/src/nnet3/nnet-training.h +++ b/src/nnet3/nnet-training.h @@ -124,6 +124,10 @@ struct ObjectiveValues { void Scale(BaseFloat scale); + void InvScale(BaseFloat inv_scale); + + void InvScale(const std::vector &inv_scales); + void Reset() { Scale(0.0); } bool IsZero() const; @@ -144,7 +148,7 @@ struct ObjectiveFunctionInfo { // 'current_phase'. double tot_weight; double tot_objf; - + // A struct used to store 'auxiliary' objective function values // that is optional- may be used when things like regularization are being // used. @@ -155,12 +159,24 @@ struct ObjectiveFunctionInfo { ObjectiveValues tot_aux_objfs_this_phase; CuVector deriv_sum; + + BaseFloat objf_scale; + std::vector aux_objf_scales; ObjectiveFunctionInfo(): current_phase(0), minibatches_this_phase(0), tot_weight(0.0), tot_objf(0.0), - tot_weight_this_phase(0.0), tot_objf_this_phase(0.0) { } + tot_weight_this_phase(0.0), tot_objf_this_phase(0.0), + objf_scale(1.0) { } + + ObjectiveFunctionInfo(BaseFloat objf_scale, + const std::vector aux_objf_scales): + current_phase(0), + minibatches_this_phase(0), + tot_weight(0.0), tot_objf(0.0), + tot_weight_this_phase(0.0), tot_objf_this_phase(0.0), + objf_scale(objf_scale), aux_objf_scales(aux_objf_scales) { } // This function updates the stats and, if the phase has just changed, // prints a message indicating progress. 
The phase equals From 803a5767c98006f2866aaf5ae4aa09b22730ddb3 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 22 Jan 2018 11:39:59 -0500 Subject: [PATCH 112/174] semisup-smbr: Some script level changes for smbr --- .../s5/steps/libs/nnet3/report/log_parse.py | 41 +++++++++++++------ egs/wsj/s5/steps/libs/nnet3/train/common.py | 10 ----- egs/wsj/s5/steps/nnet3/chain/train.py | 6 +++ .../s5/steps/nnet3/report/generate_plots.py | 19 +++++++-- 4 files changed, 51 insertions(+), 25 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index 6a66c60dd6b..3834094e84c 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -333,7 +333,7 @@ def parse_train_logs(exp_dir): return train_times -def parse_prob_logs(exp_dir, key='accuracy', output="output"): +def parse_prob_logs(exp_dir, key='accuracy', output="output", field=0): train_prob_files = "%s/log/compute_prob_train.*.log" % (exp_dir) valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir) train_prob_strings = common_lib.get_command_stdout( @@ -351,11 +351,28 @@ def parse_prob_logs(exp_dir, key='accuracy', output="output"): # Overall log-probability for 'output' is -0.307255 per frame, over 20000 # frames. - parse_regex = re.compile( - ".*compute_prob_.*\.([0-9]+).log:LOG " - ".nnet3.*compute-prob.*:PrintTotalStats..:" - "nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for " - "'{output}'.*is ([0-9.\-e]+) .*per frame".format(output=output)) + if field == 0: + parse_regex = re.compile( + ".*compute_prob_.*\.([0-9]+).log:LOG " + ".nnet3.*compute-prob.*:PrintTotalStats..:" + "nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for " + "'{output}'.*is ([0-9.\-e]+) .*per frame".format(output=output)) + else: + other_objfs_str = "" + for i in range(field): + other_objfs_str += "[0-9.\-e]+ [+] "; + + logger.info(".*compute_prob_.*\.([0-9]+).log:LOG " + ".nnet3.*compute-prob.*:PrintTotalStats..:" + "nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for " + "'{output}'.*is {other_objfs}([0-9.\-e]+) .*per frame".format( + output=output, other_objfs=other_objfs_str)) + parse_regex = re.compile( + ".*compute_prob_.*\.([0-9]+).log:LOG " + ".nnet3.*compute-prob.*:PrintTotalStats..:" + "nnet.*diagnostics.cc:[0-9]+. 
Overall ([a-zA-Z\-]+) for " + "'{output}'.*is {other_objfs}([0-9.\-e]+) .*per frame".format( + output=output, other_objfs=other_objfs_str)) train_loss = {} valid_loss = {} @@ -367,8 +384,8 @@ def parse_prob_logs(exp_dir, key='accuracy', output="output"): if groups[1] == key: train_loss[int(groups[0])] = groups[2] if not train_loss: - raise KaldiLogParseException("Could not find any lines with {k} in " - " {l}".format(k=key, l=train_prob_files)) + raise KaldiLogParseException("Could not find any values at field {f} with {k} in " + " {l}".format(f=field, k=key, l=train_prob_files)) for line in valid_prob_strings.split('\n'): mat_obj = parse_regex.search(line) @@ -378,8 +395,8 @@ def parse_prob_logs(exp_dir, key='accuracy', output="output"): valid_loss[int(groups[0])] = groups[2] if not valid_loss: - raise KaldiLogParseException("Could not find any lines with {k} in " - " {l}".format(k=key, l=valid_prob_files)) + raise KaldiLogParseException("Could not find any values at field {f} with {k} in " + " {l}".format(f=field, k=key, l=valid_prob_files)) iters = list(set(valid_loss.keys()).intersection(train_loss.keys())) if not iters: @@ -392,7 +409,7 @@ def parse_prob_logs(exp_dir, key='accuracy', output="output"): -def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): +def generate_acc_logprob_report(exp_dir, key="accuracy", output="output", field=0): try: times = parse_train_logs(exp_dir) except: @@ -403,7 +420,7 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): report = [] report.append("%Iter\tduration\ttrain_loss\tvalid_loss\tdifference") try: - data = list(parse_prob_logs(exp_dir, key, output)) + data = list(parse_prob_logs(exp_dir, key, output, field)) except: tb = traceback.format_exc() logger.warning("Error getting info from logs, exception was: " + tb) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index e921ee5afc1..7a91cd460ec 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -452,16 +452,6 @@ def verify_egs_dir(egs_dir, feat_dim, ivector_dim, ivector_extractor_id, egs_left_context, egs_right_context, left_context, right_context)) - if left_context_initial == -1: - left_context_initial = left_context - if right_context_final == -1: - right_context_final = right_context - if egs_left_context_initial == -1: - egs_left_context_initial = egs_left_context - if egs_right_context_final == -1: - egs_right_context_final = egs_right_context - - # the condition on the initial/final context is an equality condition, # not an inequality condition, as there is no mechanism to 'correct' the # context (by subtracting context) while copying the egs, like there is diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 1a4ddb477ec..6a0d772deb3 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -104,6 +104,10 @@ def get_args(): dest='left_deriv_truncate', default=None, help="Deprecated. 
Kept for back compatibility") + parser.add_argument("--chain.smbr-extra-opts", type=str, + dest='smbr_extra_opts', default=None, + action=common_lib.NullstrToNoneAction, + help="Some additional options related to sMBR") parser.add_argument("--chain.smbr-factor-schedule", type=str, dest='smbr_factor_schedule', default=None, action=common_lib.NullstrToNoneAction, @@ -567,6 +571,8 @@ def train(args, run_opts): objective_opts += " --use-smbr-objective" if silence_pdfs is not None: objective_opts += " --silence-pdfs=" + silence_pdfs + if args.smbr_extra_opts is not None: + objective_opts += " " + args.smbr_extra_opts if args.mmi_factor_schedule is not None: mmi_factor = common_train_lib.get_schedule_value( diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index 6f7987c425f..7d9aad94fcc 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -158,7 +158,7 @@ def latex_compliant_name(name_string): def generate_acc_logprob_plots(exp_dir, output_dir, plot, key='accuracy', file_basename='accuracy', comparison_dir=None, - start_iter=1, latex_report=None, output_name='output'): + start_iter=1, latex_report=None, output_name='output', field=0): assert start_iter >= 1 @@ -171,7 +171,8 @@ def generate_acc_logprob_plots(exp_dir, output_dir, plot, key='accuracy', index = 0 for dir in dirs: [report, times, data] = log_parse.generate_acc_logprob_report(dir, key, - output_name) + output_name, field) + if index == 0: # this is the main experiment directory with open("{0}/{1}.log".format(output_dir, @@ -184,7 +185,7 @@ def generate_acc_logprob_plots(exp_dir, output_dir, plot, key='accuracy', if data.shape[0] == 0: logger.warning("Couldn't find any rows for the" "accuracy/log-probability plot, not generating it") - return + continue data = data[data[:, 0] >= start_iter, :] plot_handle, = plt.plot(data[:, 0], data[:, 1], color=color_val, linestyle="--", @@ -688,6 +689,18 @@ def generate_plots(exp_dir, output_dir, output_names, comparison_dir=None, key='log-probability', file_basename='log_probability', comparison_dir=comparison_dir, start_iter=start_iter, latex_report=latex_report, output_name=output_name) + elif objective_type == "chain-smbr": + generate_acc_logprob_plots( + exp_dir, output_dir, g_plot, + key='log-probability', file_basename='smbr', + comparison_dir=comparison_dir, start_iter=start_iter, + latex_report=latex_report, output_name=output_name) + generate_acc_logprob_plots( + exp_dir, output_dir, g_plot, + key='log-probability', file_basename='log_probability', + comparison_dir=comparison_dir, start_iter=start_iter, + latex_report=latex_report, output_name=output_name, + field=2) else: logger.info("Generating " + objective_type + " objective plots") generate_acc_logprob_plots( From 2839e2e65efe0ec6356f520fdbf00a95c4d1c436 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 22 Jan 2018 11:41:13 -0500 Subject: [PATCH 113/174] semisup: Add num-copies option for combine_egs.sh --- .../steps/nnet3/multilingual/combine_egs.sh | 63 +++++++++++++++++-- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh index 75a49e1004e..1e708d58915 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh +++ b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh @@ -24,8 +24,15 @@ frames_per_iter=400000 # this is the target number of egs in each archive of egs # it egs_per_iter. 
This is just a guideline; it will pick # a number that divides the number of samples in the # entire data. -lang2weight= # array of weights one per input languge to scale example's output +lang2weight= # comma-separated list of weights one per + # input languge to scale example's output # w.r.t its input language during training. +lang2num_copies= # comma-separated list of number of copies per + # input language + # This is another way to scale the effect of + # a langauge especially when the language has + # relatively very little data. + allocate_opts= egs_prefix=egs. stage=0 @@ -55,6 +62,15 @@ if [ ${#args[@]} != $[$num_langs+1] ]; then exit 1; fi +num_copies_per_lang= +if [ ! -z "$lang2num_copies" ]; then + IFS=, read -r -a num_copies_per_lang <<< $lang2num_copies + if [ ${#num_copies_per_lang[@]} -ne $num_langs ]; then + echo "$0: --lang2num-copies must be an array of num-langs=$num_langs integers" + exit 1 + fi +fi + required="${egs_prefix}scp combine.scp train_diagnostic.scp valid_diagnostic.scp" frames_per_eg_list= train_scp_list= @@ -64,7 +80,7 @@ combine_scp_list= # read paramter from $egs_dir[0]/info and cmvn_opts # to write in multilingual egs_dir. -check_params="info/feat_dim info/ivector_dim info/left_context info/right_context cmvn_opts" +check_params="info/feat_dim info/ivector_dim info/left_context info/right_context info/left_context_initial info/right_context_final cmvn_opts" ivec_dim=`cat ${args[0]}/info/ivector_dim` if [ $ivec_dim -ne 0 ];then check_params="$check_params info/final.ie.id"; fi @@ -74,6 +90,8 @@ done cat ${args[0]}/cmvn_opts > $megs_dir/cmvn_opts || exit 1; # caution: the top-level nnet training cp ${args[0]}/info/frames_per_eg $megs_dir/info/frames_per_eg || exit 1; +declare -a multi_egs_dir + for lang in $(seq 0 $[$num_langs-1]);do multi_egs_dir[$lang]=${args[$lang]} for f in $required; do @@ -81,10 +99,43 @@ for lang in $(seq 0 $[$num_langs-1]);do echo "$0: no such file ${multi_egs_dir[$lang]}/$f." 
&& exit 1; fi done - train_scp_list="$train_scp_list ${args[$lang]}/${egs_prefix}scp" - train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_diagnostic.scp" - valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.scp" - combine_scp_list="$combine_scp_list ${args[$lang]}/combine.scp" + + if [ -z "$lang2num_copies" ] || [ ${num_copies_per_lang[$lang]} -eq 1 ]; then + train_scp_list="$train_scp_list ${multi_egs_dir[$lang]}/${egs_prefix}scp" + train_diagnostic_scp_list="$train_diagnostic_scp_list ${multi_egs_dir[$lang]}/train_diagnostic.scp" + valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${multi_egs_dir[$lang]}/valid_diagnostic.scp" + combine_scp_list="$combine_scp_list ${multi_egs_dir[$lang]}/combine.scp" + else + rm -f $megs_dir/lang${lang}_${egs_prefix}scp $megs_dir/lang${lang}_train_diagnostic.scp \ + $megs_dir/lang${lang}_valid_diagnostic.scp $megs_dir/lang${lang}_combine.scp + + if [ `echo ${num_copies_per_lang[$lang]} | awk "{print int($num_copies_per_lang)}"` != ${num_copies_per_lang[$lang]} ]; then + echo "$0: Expected --lang2num-copies to have only integers; " + echo "$0: got ${num_copies_per_lang[$lang]} for language $lang" + exit 1 + fi + + for i in `seq ${num_copies_per_lang[$lang]}`; do + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/${egs_prefix}scp >> \ + $megs_dir/lang${lang}_${egs_prefix}scp + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/train_diagnostic.scp >> \ + $megs_dir/lang${lang}_train_diagnostic.scp + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/valid_diagnostic.scp >> \ + $megs_dir/lang${lang}_valid_diagnostic.scp + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/combine.scp >> \ + $megs_dir/lang${lang}_combine.scp + done + + if [ $(head -n1 $megs_dir/lang${lang}_${egs_prefix}scp | wc -w) -ne 2 ]; then + echo "$0: Incorrect format in $megs_dir/lang${lang}_${egs_prefix}scp; something went wrong!" 
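The --lang2num-copies handling above works because each repeated scp line gets a distinct utterance key ("<utt>-<i>"), so the later shuffling and allocation stages treat every repeat as an independent example; that is what effectively multiplies a language's weight. A rough Python equivalent of the awk loop, offered only as a sketch (the file paths in the usage comment are hypothetical, not the real egs layout):

# Sketch: repeat every scp entry num_copies times, suffixing the key with the
# copy index so the duplicated lines keep unique utterance ids (mirrors the
# awk loop above).
def replicate_scp(in_scp, out_scp, num_copies):
    with open(in_scp) as fin, open(out_scp, "a") as fout:
        lines = fin.readlines()
        for i in range(1, num_copies + 1):
            for line in lines:
                key, rest = line.rstrip("\n").split(None, 1)
                fout.write("{0}-{1} {2}\n".format(key, i, rest))

# replicate_scp("lang1/egs.scp", "multi/lang1_egs.scp", 3)   # hypothetical paths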
+ exit 1 + fi + + train_scp_list="$train_scp_list $megs_dir/lang${lang}_${egs_prefix}scp" + train_diagnostic_scp_list="$train_diagnostic_scp_list $megs_dir/lang${lang}_train_diagnostic.scp" + valid_diagnostic_scp_list="$valid_diagnostic_scp_list $megs_dir/lang${lang}_valid_diagnostic.scp" + combine_scp_list="$combine_scp_list $megs_dir/lang${lang}_combine.scp" + fi this_frames_per_eg=$(cat ${args[$lang]}/info/frames_per_eg | \ awk -F, '{for (i=1; i<=NF; i++) sum += $i;} END{print int(sum / NF)}') # use average frames-per-eg From 63858b86004e2903c890f51db85936523d96e61d Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 22 Jan 2018 11:45:14 -0500 Subject: [PATCH 114/174] semisup-smbr: Add more run-level scripts --- .../s5/local/fisher_train_lms_pocolm.sh | 31 +- egs/fisher_english/s5/local/run_unk_model.sh | 16 +- ...un_tdnn_100k_250k_semisupervised_conf_a.sh | 24 +- ...un_tdnn_100k_250k_semisupervised_conf_b.sh | 22 +- ...un_tdnn_100k_500k_semisupervised_conf_a.sh | 472 ++++++++++++++++ ...un_tdnn_100k_500k_semisupervised_conf_b.sh | 472 ++++++++++++++++ .../chain/tuning/run_tdnn_100k_c_oracle.sh | 22 +- .../semisup/chain/tuning/run_tdnn_100k_d.sh | 2 + .../semisup/chain/tuning/run_tdnn_100k_f.sh | 2 +- ...nn_lstm_100k_250k_semisupervised_conf_a.sh | 513 +++++++++++++++++ ...nn_lstm_100k_250k_semisupervised_conf_c.sh | 516 +++++++++++++++++ ...nn_lstm_100k_500k_semisupervised_conf_a.sh | 518 ++++++++++++++++++ ...nn_lstm_100k_500k_semisupervised_conf_b.sh | 517 +++++++++++++++++ ...nn_lstm_100k_500k_semisupervised_conf_c.sh | 513 +++++++++++++++++ .../chain/tuning/run_tdnn_lstm_100k_b.sh | 225 ++++++++ .../chain/tuning/run_tdnn_lstm_100k_c.sh | 225 ++++++++ .../tuning/run_tdnn_lstm_100k_oracle_b.sh | 244 +++++++++ .../tuning/run_tdnn_lstm_100k_oracle_c.sh | 247 +++++++++ .../chain/tuning/run_tdnn_lstm_100k_smbr_a.sh | 228 ++++++++ .../chain/tuning/run_tdnn_lstm_15k_a.sh | 232 ++++++++ .../chain/tuning/run_tdnn_lstm_15k_b.sh | 232 ++++++++ .../tuning/run_tdnn_lstm_15k_oracle_a.sh | 244 +++++++++ ...run_tdnn_lstm_15k_semisupervised_conf_a.sh | 507 +++++++++++++++++ ...run_tdnn_lstm_15k_semisupervised_conf_b.sh | 509 +++++++++++++++++ ...run_tdnn_lstm_15k_semisupervised_conf_c.sh | 507 +++++++++++++++++ .../chain/tuning/run_tdnn_lstm_15k_smbr_a.sh | 232 ++++++++ .../chain/tuning/run_tdnn_lstm_50k_a.sh | 232 ++++++++ .../chain/tuning/run_tdnn_lstm_50k_b.sh | 232 ++++++++ ...run_tdnn_lstm_50k_semisupervised_conf_a.sh | 511 +++++++++++++++++ ...run_tdnn_lstm_50k_semisupervised_conf_b.sh | 511 +++++++++++++++++ .../s5/local/semisup/run_100k.sh | 34 +- .../s5/local/semisup/run_10k.sh | 27 +- 32 files changed, 8755 insertions(+), 64 deletions(-) create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_500k_semisupervised_conf_a.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_500k_semisupervised_conf_b.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_a.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_c.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_a.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_b.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_c.sh create mode 100755 
egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_b.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_c.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_b.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_c.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_a.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_b.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_oracle_a.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_a.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_b.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_c.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_smbr_a.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_a.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_b.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_semisupervised_conf_a.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_semisupervised_conf_b.sh diff --git a/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh b/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh index 0152a64ae01..a9a76f7f775 100755 --- a/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh +++ b/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh @@ -146,8 +146,21 @@ fi if [ $stage -le 2 ]; then echo "$0: pruning the LM (to larger size)" # Using 5 million n-grams for a big LM for rescoring purposes. - prune_lm_dir.py --target-num-ngrams=$num_ngrams_large --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big - + prune_lm_dir.py --target-num-ngrams=$num_ngrams_large --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big \ + 2> >(tee -a ${dir}/data/lm_${order}_prune_big/prune_lm.log >&2) || true + + if [ ! -f ${dir}/data/lm_${order}_prune_big/metaparameters ]; then + if [ -z `tail ${dir}/data/lm_${order}_prune_big/prune_lm.log | grep "can not do any pruning"` ]; then + echo "$0: LM could not be pruned. Something went wrong!" + exit 1 + fi + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz + echo "$0: No pruning necessary as num-ngrams is less than target" + exit 0 + fi + get_data_prob.py ${dir}/data/test.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_big/perplexity_test.log get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_big/perplexity_real_dev_set.log @@ -160,7 +173,19 @@ if [ $stage -le 3 ]; then echo "$0: pruning the LM (to smaller size)" # Using 3 million n-grams for a smaller LM for graph building. Prune from the # bigger-pruned LM, it'll be faster. 
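The pruning change above follows a prune-or-skip pattern: prune_lm_dir.py is allowed to exit non-zero, its stderr is teed into prune_lm.log, and the missing metaparameters file is only treated as an error when the log does not say that no pruning was possible; otherwise the unpruned (or big-pruned) LM is reused. A minimal Python sketch of that decision, with illustrative paths and a hypothetical helper name:

import os
import sys

# Sketch of the prune-or-skip check: distinguish "nothing to prune" (benign)
# from a real pruning failure by inspecting the captured pocolm log.
def check_prune_result(prune_dir):
    if os.path.exists(os.path.join(prune_dir, "metaparameters")):
        return "pruned"        # normal case: pruning produced a new LM dir
    with open(os.path.join(prune_dir, "prune_lm.log")) as f:
        tail = f.readlines()[-10:]
    if any("can not do any pruning" in line for line in tail):
        return "skip"          # LM already at or below the target size
    sys.exit("LM could not be pruned; something went wrong")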
- prune_lm_dir.py --target-num-ngrams=$num_ngrams_small ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + prune_lm_dir.py --target-num-ngrams=$num_ngrams_small ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small \ + 2> >(tee -a ${dir}/data/lm_${order}_prune_small/prune_lm.log >&2) || true + + if [ ! -f ${dir}/data/lm_${order}_prune_small/metaparameters ]; then + if [ -z `tail ${dir}/data/lm_${order}_prune_small/prune_lm.log | grep "can not do any pruning"` ]; then + echo "$0: LM could not be pruned. Something went wrong!" + exit 1 + fi + + ln -s ${order}gram_big.arpa.gz $dir/data/arpa/${order}gram_small.arpa.gz + exit 0 + fi + get_data_prob.py ${dir}/data/test.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_small/perplexity_test.log diff --git a/egs/fisher_english/s5/local/run_unk_model.sh b/egs/fisher_english/s5/local/run_unk_model.sh index 5e390549cc3..d4f1bbc2be6 100755 --- a/egs/fisher_english/s5/local/run_unk_model.sh +++ b/egs/fisher_english/s5/local/run_unk_model.sh @@ -2,8 +2,6 @@ # Copyright 2017 Vimal Manohar -lang_dirs= - utils/lang/make_unk_lm.sh data/local/dict exp/unk_lang_model || exit 1 utils/prepare_lang.sh \ @@ -15,13 +13,13 @@ utils/prepare_lang.sh \ # keeps the graph compact after adding the unk model (we only have to add one # copy of it). -for lang_dir in $lang_dirs; do - rm -r ${lang_dir}_unk 2>/dev/null || true - mkdir -p ${lang_dir}_unk - cp -r data/lang_unk ${lang_dir}_unk - if [ -f ${lang_dir}/G.fst ]; then cp ${lang_dir}/G.fst ${lang_dir}_unk/G.fst; fi - if [ -f ${lang_dir}/G.carpa ]; then cp ${lang_dir}/G.carpa ${lang_dir}_unk/G.carpa; fi -done +# for lang_dir in $lang_dirs; do +# rm -r ${lang_dir}_unk 2>/dev/null || true +# mkdir -p ${lang_dir}_unk +# cp -r data/lang_unk ${lang_dir}_unk +# if [ -f ${lang_dir}/G.fst ]; then cp ${lang_dir}/G.fst ${lang_dir}_unk/G.fst; fi +# if [ -f ${lang_dir}/G.carpa ]; then cp ${lang_dir}/G.carpa ${lang_dir}_unk/G.carpa; fi +# done exit 0 diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_a.sh index b1b29be6026..744acd3e386 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_a.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!bin/bash # Unsupervised set: train_unsup100k_250k # unsup_frames_per_eg=150 @@ -97,7 +97,6 @@ fi lang=data/lang_chain unsup_decode_lang=data/lang_poco_test_sup100k -unsup_rescore_lang=${unsup_decode_lang}_big unsup_decode_graph_affix=_poco_sup100k test_lang=data/lang_poco_test @@ -153,21 +152,16 @@ for dset in $unsupervised_set; do --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_sp_hires \ --scoring-opts "--min-lmwt 10 --max-lmwt 10" --determinize-opts "--word-determinize=false" \ $graphdir data/${dset}_sp_hires $chaindir/decode_${dset}_sp${decode_affix} - fi - - if [ $stage -le 6 ]; then - steps/lmrescore_const_arpa_undeterminized.sh --cmd "$decode_cmd" \ - --write-compact false --acwt 0.1 --beam 8.0 --skip-scoring true \ - $unsup_decode_lang $unsup_rescore_lang \ - data/${dset}_sp_hires \ - $chaindir/decode_${dset}_sp${decode_affix} \ - $chaindir/decode_${dset}_sp${decode_affix}_big - ln -sf ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}_big/ || true + ln -sf ../final.mdl 
$chaindir/decode_${dset}_sp${decode_affix}/ || true fi done -decode_affix=${decode_affix}_big +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi + if [ $stage -le 8 ]; then steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ data/${unsupervised_set}_sp_hires $lang \ @@ -176,10 +170,6 @@ if [ $stage -le 8 ]; then echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor fi -frame_subsampling_factor=1 -if [ -f $chaindir/frame_subsampling_factor ]; then - frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` -fi cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 sup_ali_dir=$exp/tri4a diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_b.sh index cd76a4f4e76..1308339ad93 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_b.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_250k_semisupervised_conf_b.sh @@ -97,7 +97,6 @@ fi lang=data/lang_chain_unk unsup_decode_lang=data/lang_poco_test_sup100k_unk -unsup_rescore_lang=${unsup_decode_lang}_big unsup_decode_graph_affix=_poco_sup100k test_lang=data/lang_poco_test_unk @@ -153,21 +152,16 @@ for dset in $unsupervised_set; do --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_sp_hires \ --scoring-opts "--min-lmwt 10 --max-lmwt 10" --determinize-opts "--word-determinize=false" \ $graphdir data/${dset}_sp_hires $chaindir/decode_${dset}_sp${decode_affix} - fi - - if [ $stage -le 6 ]; then - steps/lmrescore_const_arpa_undeterminized.sh --cmd "$decode_cmd" \ - --write-compact false --acwt 0.1 --beam 8.0 --skip-scoring true \ - $unsup_decode_lang $unsup_rescore_lang \ - data/${dset}_sp_hires \ - $chaindir/decode_${dset}_sp${decode_affix} \ - $chaindir/decode_${dset}_sp${decode_affix}_big - ln -sf ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}_big/ || true + ln -sf ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}/ || true fi done -decode_affix=${decode_affix}_big +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi + if [ $stage -le 8 ]; then steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ data/${unsupervised_set}_sp_hires $lang \ @@ -176,10 +170,6 @@ if [ $stage -le 8 ]; then echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor fi -frame_subsampling_factor=1 -if [ -f $chaindir/frame_subsampling_factor ]; then - frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` -fi cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 sup_ali_dir=$exp/tri4a diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_500k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_500k_semisupervised_conf_a.sh new file mode 100644 index 00000000000..5ac69af585f --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_500k_semisupervised_conf_a.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 6,2 +# LM for decoding 
unsupervised data: 3gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_500k + +tdnn_affix=7f # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_500k_1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=6,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_f +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
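As the header of this script notes, the deriv weights are the lattice posteriors of the best-path pdf: best_path_weights.sh above writes a weights.scp that is later passed via --deriv-weights-scp, and with --chain.apply-deriv-weights true each unsupervised frame's derivative is simply scaled by that posterior, so low-confidence frames contribute little to the update. A schematic sketch of that scaling (the arrays and shapes below are made up, not real Kaldi structures):

import numpy as np

# Sketch: per-frame derivative weighting for the unsupervised examples.
def apply_deriv_weights(deriv, weights):
    assert deriv.shape[0] == weights.shape[0]
    return deriv * weights[:, np.newaxis]   # frames with posterior ~0 barely train

deriv = np.ones((5, 3))                          # frames x pdfs (illustrative)
weights = np.array([1.0, 0.9, 0.2, 0.0, 0.7])    # best-path pdf posteriors
print(apply_deriv_weights(deriv, weights))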
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
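The context arithmetic above just pads the model's own context with any extra (or chunk) context and half the frame-subsampling factor, rounded down by the perl int(). With hypothetical values model_left_context=16, extra_left_context=0 and frame_subsampling_factor=3, the egs would be cut with a left context of 17. The same computation in Python, as a worked example only:

# Same computation as the perl one-liners above (values are hypothetical).
def egs_context(model_context, extra_context, frame_subsampling_factor):
    return int(model_context + extra_context + frame_subsampling_factor / 2.0)

print(egs_context(16, 0, 3))   # -> 17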
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_500k_semisupervised_conf_b.sh 
b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_500k_semisupervised_conf_b.sh new file mode 100644 index 00000000000..d345ca5f20e --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_500k_semisupervised_conf_b.sh @@ -0,0 +1,472 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 6,2 +# LM for decoding unsupervised data: 3gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_500k + +tdnn_affix=7d # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_500k_1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=6,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=bi_d +unsup_egs_opts= +apply_deriv_weights=true +use_smart_splitting=true + +do_finetuning=false + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size="150=128/300=64" +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine skip-in-init=true + output name=output-1 input=output.affine skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
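When the supervised and unsupervised egs dirs are merged by combine_egs.sh below with --lang2weight $supervision_weights, each source keeps its own output node from the xconfig above (output-0 for the first, supervised, egs dir and output-1 for the second, unsupervised, one), and the per-source weight scales that source's term in the overall objective. A purely schematic sketch of that weighting; the loss values are invented for illustration:

# Schematic view of --lang2weight (supervision_weights=1.0,1.0 in this recipe).
def combined_objective(losses, weights):
    return sum(weights[name] * loss for name, loss in losses.items())

weights = {"output-0": 1.0, "output-1": 1.0}
losses = {"output-0": -0.08, "output-1": -0.15}   # made-up per-source losses
print(combined_objective(losses, weights))         # -> -0.23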
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c_oracle.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c_oracle.sh index 696b26d1d5a..d14aa752c14 
100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c_oracle.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_c_oracle.sh @@ -13,7 +13,6 @@ decode_iter= supervised_set=train_sup unsupervised_set=train_unsup100k_250k_n10k base_train_set=train_oracle100k_250k_n10k -ivector_train_set=train_sup tree_affix=bi_a nnet3_affix= chain_affix= @@ -54,18 +53,27 @@ treedir=$exp/chain${chain_affix}/tree_${tree_affix} lat_dir=$exp/chain${chain_affix}/$(basename $gmm_dir)_${base_train_set}_sp_lats # training lattices directory dir=$exp/chain${chain_affix}/tdnn${tdnn_affix}_sp train_data_dir=data/${base_train_set}_sp_hires -train_ivector_dir=$exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_sp_hires +train_ivector_dir=$exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires lang=data/lang_chain # The iVector-extraction and feature-dumping parts are the same as the standard # nnet3 setup, and you can skip them by setting "--stage 8" if you have already # run those things. -local/nnet3/run_ivector_common_pca.sh --stage $stage --exp $exp \ - --speed-perturb true \ - --train-set $supervised_set \ - --ivector-train-set $supervised_set \ - --nnet3-affix "$nnet3_affix" || exit 1 +#local/nnet3/run_ivector_common_pca.sh --stage $stage --exp $exp \ +# --speed-perturb true \ +# --train-set $supervised_set \ +# --ivector-train-set $supervised_set \ +# --nnet3-affix "$nnet3_affix" || exit 1 + +if [ $stage -le 8 ]; then + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 \ + data/${base_train_set}_sp_hires data/${base_train_set}_sp_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${base_train_set}_sp_max2_hires $exp/nnet3${nnet3_affix}/extractor \ + $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires || exit 1 +fi if [ $stage -le 9 ]; then # Get the alignments as lattices (gives the chain training more freedom). diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_d.sh index 4965e0d4dcb..569d4d0604e 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_d.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_d.sh @@ -68,6 +68,8 @@ if [ $stage -le 9 ]; then rm $lat_dir/fsts.*.gz # save space fi +exit 1 + if [ $stage -le 10 ]; then # Create a version of the lang/ directory that has one state per phone in the # topo file. [note, it really has two states.. 
the first one is only repeated diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_f.sh index dca562e7c20..b6aa3520d3b 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_f.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_f.sh @@ -12,7 +12,7 @@ get_egs_stage=-10 decode_iter= train_set=train_sup ivector_train_set=train_sup -tree_affix=bi_d +tree_affix=bi_f nnet3_affix= chain_affix= exp=exp/semisup_100k diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_a.sh new file mode 100644 index 00000000000..ebf52fa8b40 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_a.sh @@ -0,0 +1,513 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_a + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
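+    # Note (a sketch of what the options below are doing, as I read this
+    # recipe; not authoritative): the unsupervised examples are built from
+    # the decode lattices in $unsup_lat_dir rather than from alignments, so
+    # --alignment-subsampling-factor is 1 (the lattices should already be at
+    # the model's output frame rate).  --lattice-lm-scale 0.5 keeps the
+    # lattice graph/LM scores at half weight in the supervision,
+    # --lattice-prune-beam 4.0 prunes the lattices first, and the
+    # left/right tolerance of 1 allows one frame of slack.  The
+    # --deriv-weights-scp file holds the per-frame best-path pdf posteriors
+    # mentioned at the top of this script, used to down-weight derivatives
+    # on unreliable frames.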
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_c.sh new file mode 100644 index 00000000000..cca77616936 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_c.sh @@ -0,0 +1,516 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_a + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1c # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=768 +cell_dim=768 +projection_dim=192 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! 
-f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
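+    # Worked example of the context padding passed below (illustrative only;
+    # the real values come from $dir/configs/vars and the chain dir, and the
+    # numbers here are hypothetical): with model_left_context=40,
+    # chunk_left_context=40 and frame_subsampling_factor=3,
+    #   left_context     = 40 + 40 = 80
+    #   egs_left_context = int(80 + 3/2) = 81
+    # i.e. the egs get one extra frame of padding on each side to cover the
+    # frame-subsampling offset.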
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_a.sh new file mode 100644 index 00000000000..72b3ef0cb25 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_a.sh @@ -0,0 +1,518 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_500k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_a + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_500k_1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +num_copies= +lm_weights=3,1 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=dnn2 dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=dnn4 dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=dnn6 dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false learning-rate-factor=$learning_rate_factor max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
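+    # A brief note on the branch just below (my reading of the recipe, not a
+    # spec): when use_smart_splitting=true the unsupervised egs are dumped
+    # with steps/nnet3/chain/get_egs_split.sh, which splits the lattice
+    # supervision itself into chunks; otherwise the standard get_egs.sh is
+    # used.  Also, unsup_frames_per_eg was set to 150 at the top of this
+    # script; had it been left empty it would have defaulted to the
+    # supervised frames_per_eg read from $chaindir/egs/info/frames_per_eg
+    # (see the check above).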
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight "$supervision_weights" \ + --lang2num-copies "$num_copies" --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_b.sh new file mode 100644 index 00000000000..905309bc5c7 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_b.sh @@ -0,0 +1,517 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_500k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_a + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_500k_1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +num_copies= +lm_weights=3,1 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false learning-rate-factor=$learning_rate_factor max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # 
trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
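+  # A brief note on the combination above (descriptive comment, not part of
+  # the original recipe): the trailing "2" is the number of input egs dirs,
+  # --lang2weight applies the per-source supervision weights given by
+  # $supervision_weights, and --lang2num-copies can duplicate egs from one
+  # source to rebalance how much supervised vs. unsupervised data is seen.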
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_c.sh new file mode 100644 index 00000000000..1e8ce3039a6 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_c.sh @@ -0,0 +1,513 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k 
+semisup_train_set= # semisup100k_500k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_a + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_500k_1c # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,1 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false learning-rate-factor=$learning_rate_factor max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
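+  # As a concrete example of the factor computed above: with
+  # xent_regularize=0.025, learning_rate_factor = 0.5 / 0.025 = 20.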
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
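+    # For the unsupervised data the supervision comes from the decoding
+    # lattices in $unsup_lat_dir rather than fixed alignments:
+    # --lattice-lm-scale controls how much the lattice scores are used in
+    # the supervision weights, --lattice-prune-beam prunes the lattices
+    # before they are split into egs, and --deriv-weights-scp applies
+    # per-frame derivative weights taken from the best-path pdf posteriors,
+    # so that unreliable frames contribute less to the gradient.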
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_b.sh new file mode 100755 index 00000000000..e686d977ded --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_b.sh @@ -0,0 +1,225 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_a +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 
cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_c.sh new file mode 100755 index 00000000000..1854a4a86e1 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_c.sh @@ -0,0 +1,225 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. + +# configs for 'chain' +stage=0 +tdnn_affix=7c +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_c +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
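+  # The tree is built on the speed-perturbed supervised data using the chain
+  # topology created above; --frame-subsampling-factor 3 matches the reduced
+  # output frame rate of the chain model, and 7000 is the target number of
+  # tree leaves.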
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
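+  # Training below is chunk-based: variable chunk widths of 160,140,110,80
+  # frames, 40 frames of extra left context for the LSTM recurrence, and a
+  # dropout schedule that ramps from 0 up to 0.3 halfway through training
+  # and back to 0 by the end.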
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_b.sh new file mode 100755 index 00000000000..988299a4621 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_b.sh @@ -0,0 +1,244 @@ +#!/bin/bash +set -e +set -u + +# This is oracle experiment for semi-supervised training with 100 hours +# of supervised data and 250 hours of unsupervised data + +# configs for 'chain' +stage=0 +tdnn_affix=7b_oracle +train_stage=-10 +get_egs_stage=-10 +decode_iter= +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k_n10k +base_train_set=train_oracle100k_250k_n10k +tree_affix=bi_a +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a + +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ ! 
-f $treedir/final.mdl ]; then + echo "$0: Could not find $treedir/final.mdl" + exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
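+  # Note: this is the oracle configuration: the "unsupervised" subset is
+  # included with its reference transcripts (see base_train_set above), so
+  # the result serves as an upper bound for the semi-supervised recipes.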
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_c.sh new file mode 100755 index 00000000000..b21dd72a37a --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_c.sh @@ -0,0 +1,247 @@ +#!/bin/bash +set -e +set -u + +# This is oracle experiment for semi-supervised training with 100 hours +# of supervised data and 250 hours of unsupervised data + +# configs for 'chain' +stage=0 +tdnn_affix=7c_oracle +train_stage=-10 +get_egs_stage=-10 +decode_iter= +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k_n10k +base_train_set=train_oracle100k_250k_n10k +tree_affix=bi_a +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a + +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ ! 
-f $treedir/final.mdl ]; then + echo "$0: Could not find $treedir/final.mdl" + exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=dnn2 dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=dnn4 dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=dnn6 dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh new file mode 100755 index 00000000000..5171890b981 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh @@ -0,0 +1,228 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. 
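+# Unlike the plain LF-MMI recipes above, this one passes MMI and sMBR factor
+# schedules to train.py through $extra_opts:
+#   --chain.mmi-factor-schedule=1.0,1.0@0.1,0.5@0.2,0.5
+#   --chain.smbr-factor-schedule=0.0,0.0@0.1,0.5@0.2,0.5
+# i.e. roughly: pure LF-MMI for the first 10% of training, then an equal
+# 0.5/0.5 mix of MMI and sMBR from 20% of training onwards.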
+ +# configs for 'chain' +stage=0 +tdnn_affix=7smbr_a +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_c +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 +extra_opts="--chain.mmi-factor-schedule=1.0,1.0@0.1,0.5@0.2,0.5 --chain.smbr-factor-schedule=0.0,0.0@0.1,0.5@0.2,0.5" +chain_smbr_extra_opts= + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --dir $dir --lang $lang $extra_opts || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_a.sh new file mode 100755 index 00000000000..bf1e4878c8e --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_a.sh @@ -0,0 +1,232 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is similar to _d, but uses a phone LM UNK model + +# configs for 'chain' +stage=0 +tdnn_affix=7a +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +unsupervised_set=train_unsup100_250k +semisup_train_set=semisup15k_250k +tree_affix=bi_i +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=10 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
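  # A couple of illustrative notes (not commands the recipe runs): once this stage
  # has built the tree, it can be sanity-checked with
  #   tree-info $treedir/tree | grep num-pdfs
  # which should report the same pdf count that stage 12 below reads into $num_targets.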
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
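The learning_rate_factor computed in the xconfig stage above is easy to sanity-check: with the xent_regularize=0.025 used in this script, 0.5 / 0.025 = 20, so the xent output layer trains at 20 times the base rate and its effective learning rate is independent of the regularization constant, as the comment in the xconfig block explains. A one-line check (illustration only, mirroring the script's own python pipe):

  xent_regularize=0.025
  echo "print(0.5 / $xent_regularize)" | python   # prints 20.0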
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_b.sh new file mode 100755 index 00000000000..3c9ab27a353 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_b.sh @@ -0,0 +1,232 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. 
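A note on the decode stages in these scripts: --acwt 1.0 with --post-decode-acwt 10.0 is the usual chain-model convention, decoding at acoustic scale 1.0 and then scaling the lattice acoustic scores by 10 so that the ordinary LM-weight range still applies at scoring time, and the job count is simply one decode job per speaker. A small sketch of that job-count idiom with a hypothetical decode_set:

  decode_set=dev   # hypothetical value, for illustration
  num_jobs=$(cut -d' ' -f2 data/${decode_set}_hires/utt2spk | sort -u | wc -l)
  echo "launching $num_jobs decode jobs for ${decode_set}_hires"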
+# This is similar to _d, but uses a phone LM UNK model + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +unsupervised_set=train_unsup100_250k +semisup_train_set=semisup15k_250k +tree_affix=bi_j +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=10 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_oracle_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_oracle_a.sh new file mode 100755 index 00000000000..997a17a5329 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_oracle_a.sh @@ -0,0 +1,244 @@ +#!/bin/bash +set -e +set -u + +# This is oracle experiment for semi-supervised training with 100 hours +# of supervised data and 250 hours of unsupervised data + +# configs for 'chain' +stage=0 +tdnn_affix=7a_oracle +train_stage=-10 +get_egs_stage=-10 +decode_iter= +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k_n10k +base_train_set=train_oracle100k_250k_n10k +tree_affix=bi_i +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ ! 
-f $treedir/final.mdl ]; then + echo "$0: Could not find $treedir/final.mdl" + exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
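One remark on the optimization options used by all the train.py calls in this patch: the *-effective-lrate values are effective rates, and our understanding of the nnet3 training scripts (an assumption, not stated in this patch) is that the rate actually applied on each iteration is the effective rate times the current number of jobs, so these settings ramp from about 3 x 0.001 at the start to 16 x 0.0001 at the end. The arithmetic, for what it is worth:

  # illustration of the assumed effective-lrate * num-jobs convention
  awk 'BEGIN { printf "initial ~ %g, final ~ %g\n", 3 * 0.001, 16 * 0.0001 }'   # initial ~ 0.003, final ~ 0.0016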
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_a.sh new file mode 100644 index 00000000000..145b4c0e178 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_a.sh @@ -0,0 +1,507 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_15k + +unsupervised_set=train_unsup100k_250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semisup_train_set=semisup15k_250k + +tdnn_affix=7a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_i + +nnet3_affix=_semi15k_250k # affix for nnet3 and chain dir -- relates to i-vector used +chain_affix=_semi15k_250k + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${chain_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
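The egs context values computed above just pad the model context (read from $dir/configs/vars) with the chunk context plus half a frame-subsampling-factor. A worked example with made-up numbers (model_left_context comes from the generated configs; 40 is purely illustrative):

  model_left_context=40; chunk_left_context=40; frame_subsampling_factor=3
  left_context=$((model_left_context + chunk_left_context))                    # 80
  perl -e "print int($left_context + $frame_subsampling_factor / 2), qq(\n)"   # prints 81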
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_b.sh new file mode 100644 index 00000000000..e9749bd7676 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_b.sh @@ -0,0 +1,509 @@ +#!/bin/bash + +# This script uses phone LM to model UNK. + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_15k + +unsupervised_set=train_unsup100k_250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semisup_train_set=semisup15k_250k + +tdnn_affix=7a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_i + +nnet3_affix=_semi15k_250k # affix for nnet3 and chain dir -- relates to i-vector used +chain_affix=_semi15k_250k + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${chain_affix}/tri3_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
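Two of the unsupervised-egs knobs above deserve a gloss: lattice_lm_scale=0.5 keeps the LM scores from the unsupervised lattices at half weight when they are used as supervision, and lattice_prune_beam=4.0 prunes those lattices before egs extraction, as the comments near the top of the script indicate; both values are baked into the egs directory name through egs_affix. A quick sketch of the resulting suffix with the defaults:

  lattice_prune_beam=4.0; lattice_lm_scale=0.5; tolerance=1
  echo "_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance}"   # _prun4.0_lmwt0.5_tol1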
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_c.sh new file mode 100644 index 00000000000..01c0191be83 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_c.sh @@ -0,0 +1,507 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_15k + +unsupervised_set=train_unsup100k_250k # set this to your choice of unsupervised data +supervised_set=train_sup15k +semisup_train_set=semisup15k_250k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_j + +nnet3_affix=_semi15k_250k # affix for nnet3 and chain dir -- relates to i-vector used +chain_affix=_semi15k_250k + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1c # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${chain_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
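+
+    # A summary of what the call below does with the unsupervised data (the
+    # options themselves are set at the top of this script): the seed model's
+    # decode lattices in $unsup_lat_dir serve as the supervision,
+    # --lattice-lm-scale 0.5 retains that fraction of the lattice LM scores as
+    # weights in the supervision graph, --lattice-prune-beam 4.0 prunes the
+    # lattices first, and --deriv-weights-scp scales each frame's derivative by
+    # the lattice posterior of the best-path pdf, as noted in the header above.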
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_smbr_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_smbr_a.sh new file mode 100755 index 00000000000..aff735560e0 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_smbr_a.sh @@ -0,0 +1,232 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is similar to _d, but uses a phone LM UNK model + +# configs for 'chain' +stage=0 +tdnn_affix=7smbr_a +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup15k +unsupervised_set=train_unsup100_250k +semisup_train_set=semisup15k_250k +tree_affix=bi_i +nnet3_affix=_semi15k_250k +chain_affix=_semi15k_250k +exp=exp/semisup_15k +gmm=tri3 +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=10 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 +extra_opts="--chain.mmi-factor-schedule=1.0,1.0@0.1,0.5@0.2,0.5 --chain.smbr-factor-schedule=0.0,0.0@0.1,0.5@0.2,0.5" +chain_smbr_extra_opts= + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
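+  # In the call below, --frame-subsampling-factor 3 matches the reduced output
+  # frame rate of the 'chain' model, --context-opts "--context-width=2
+  # --central-position=1" builds the tree on left-biphone context, 7000 is the
+  # target number of tree leaves (pdfs), and the statistics come from the
+  # supervised alignments/lattices in $lat_dir.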
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
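+
+  # The $extra_opts defined near the top of this script add
+  # --chain.mmi-factor-schedule=1.0,1.0@0.1,0.5@0.2,0.5 and
+  # --chain.smbr-factor-schedule=0.0,0.0@0.1,0.5@0.2,0.5 to the call below.
+  # Assuming these schedules follow the same value@fraction-of-training
+  # convention as --trainer.dropout-schedule, training starts as pure MMI and,
+  # from roughly 20% of the way through, interpolates the MMI and sMBR
+  # objectives with weight 0.5 each.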
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --dir $dir --lang $lang $extra_opts || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_a.sh new file mode 100755 index 00000000000..6bafc30f3aa --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_a.sh @@ -0,0 +1,232 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. 
+# This is similar to _d, but uses a phone LM UNK model + +# configs for 'chain' +stage=0 +tdnn_affix=7a +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup50k +unsupervised_set=train_unsup100_250k +semisup_train_set=semisup50k_250k +tree_affix=bi_i +nnet3_affix=_semi50k_250k +chain_affix=_semi50k_250k +exp=exp/semisup_50k +gmm=tri4a +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=10 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_b.sh new file mode 100755 index 00000000000..aa0387cc1d4 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_b.sh @@ -0,0 +1,232 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. +# This is similar to _d, but uses a phone LM UNK model + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup50k +unsupervised_set=train_unsup100_250k +semisup_train_set=semisup50k_250k +tree_affix=bi_j +nnet3_affix=_semi50k_250k +chain_affix=_semi50k_250k +exp=exp/semisup_50k +gmm=tri4a +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=10 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
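+  # The resulting tree directory (tree_affix=bi_j) appears to be the one that
+  # the corresponding semi-supervised run,
+  # run_tdnn_lstm_50k_semisupervised_conf_b.sh, later picks up through its
+  # --tree-dir, so the seed system and the combined system share the same set
+  # of pdfs.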
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
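+
+  # Two notes on the options below: --egs.chunk-width 160,140,110,80 dumps egs
+  # with a mix of chunk lengths (the first value is the principal one; the
+  # alternatives mainly help pack shorter utterances), and since
+  # xent_regularize=0.025 the xent output layer above was built with
+  # learning-rate-factor 0.5/0.025 = 20, so it learns at a rate independent of
+  # the small regularization constant.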
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_semisupervised_conf_a.sh new file mode 100644 index 00000000000..bab9e69bbf3 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_semisupervised_conf_a.sh @@ -0,0 +1,511 @@ +#!/bin/bash + +# This script uses phone LM to model UNK. 
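+#
+# Overall flow (shared with the other *_semisupervised_conf_* scripts here):
+# lattices from decoding the unsupervised data with the seed chain model are
+# used as supervision, egs are dumped separately for the supervised and
+# unsupervised sets, combined with the per-set weights in $supervision_weights,
+# and the final model is trained on the combined egs.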
+ +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_50k + +unsupervised_set=train_unsup100k_250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semisup_train_set=semisup50k_250k + +tdnn_affix=7a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_i + +gmm=tri4a + +nnet3_affix=_semi50k_250k # affix for nnet3 and chain dir -- relates to i-vector used +chain_affix=_semi50k_250k + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${chain_affix}/${gmm}_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
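+
+    # Two details of the call below worth noting: --alignment-subsampling-factor
+    # is 1 (not 3) because these lattices come from decoding with the seed chain
+    # model and are therefore already at the subsampled output frame rate; and
+    # with use_smart_splitting=true the egs are dumped by get_egs_split.sh,
+    # which is intended to split the lattice supervision across chunks more
+    # carefully than the default script, with --left/right-tolerance 1 allowing
+    # one frame of slack at the supervision boundaries.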
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_semisupervised_conf_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_semisupervised_conf_b.sh new file mode 100644 index 00000000000..ebd6c090267 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_50k_semisupervised_conf_b.sh @@ -0,0 +1,511 @@ +#!/bin/bash + +# This script uses phone LM to model UNK. + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +exp=exp/semisup_50k + +unsupervised_set=train_unsup100k_250k # set this to your choice of unsupervised data +supervised_set=train_sup50k +semisup_train_set=semisup50k_250k + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_j + +gmm=tri4a + +nnet3_affix=_semi50k_250k # affix for nnet3 and chain dir -- relates to i-vector used +chain_affix=_semi50k_250k + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${chain_affix}/${gmm}_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${semisup_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/run_100k.sh b/egs/fisher_english/s5/local/semisup/run_100k.sh index 8458939cf3c..bbc5b1bf8cc 100644 --- a/egs/fisher_english/s5/local/semisup/run_100k.sh +++ b/egs/fisher_english/s5/local/semisup/run_100k.sh @@ -19,6 +19,7 @@ if [ ! -f data/train_sup/utt2spk ]; then exit 1 fi +false && { utils/subset_data_dir.sh --shortest data/train_sup 100000 data/train_sup_100kshort utils/subset_data_dir.sh data/train_sup_100kshort 10000 data/train_sup_10k utils/data/remove_dup_utts.sh 100 data/train_sup_10k data/train_sup_10k_nodup @@ -75,22 +76,37 @@ steps/train_sat.sh --cmd "$train_cmd" \ utils/copy_data_dir.sh data/train_unsup250k data/train_unsup100k_250k utils/combine_data.sh data/semisup100k_250k data/train_sup \ data/train_unsup100k_250k || exit 1 +} -if [ ! -f data/lang_test_poco_sup100k_big/G.carpa ]; then - local/fisher_train_lms_pocolm.sh \ - --text data/train_sup/text \ - --dir data/local/lm_sup100k +local/fisher_train_lms_pocolm.sh \ + --text data/train_sup/text \ + --dir data/local/pocolm_sup100k - local/fisher_create_test_lang.sh \ - --arpa-lm data/local/pocolm_sup100k/data/arpa/4gram_small.arpa.gz \ - --dir data/lang_test_poco_sup100k +local/fisher_create_test_lang.sh \ + --arpa-lm data/local/pocolm_sup100k/data/arpa/4gram_small.arpa.gz \ + --dir data/lang_poco_test_sup100k +lang_dirs=data/lang_poco_test_sup100k +if [ -f data/local/pocolm_sup100k/data/arpa/4gram_big.arpa.gz ]; then utils/build_const_arpa_lm.sh \ data/local/pocolm_sup100k/data/arpa/4gram_big.arpa.gz \ - data/lang_test_poco_sup100k data/lang_test_poco_sup100k_big + data/lang_poco_test_sup100k data/lang_poco_test_sup100k_big + lang_dirs="$lang_dirs data/lang_poco_test_sup100k_big" fi -local/run_unk_model.sh --lang-dirs "data/lang_test_poco_sup100k_big data/lang_test_poco_sup100k" || exit 1 +if [ ! -f data/lang_unk/words.txt ]; then + local/run_unk_model.sh +fi + +for lang_dir in $lang_dirs; do + rm -r ${lang_dir}_unk 2>/dev/null || true + mkdir -p ${lang_dir}_unk + cp -r data/lang_unk/* ${lang_dir}_unk + if [ -f ${lang_dir}/G.fst ]; then cp ${lang_dir}/G.fst ${lang_dir}_unk/G.fst; fi + if [ -f ${lang_dir}/G.carpa ]; then cp ${lang_dir}/G.carpa ${lang_dir}_unk/G.carpa; fi +done + +exit 0 local/semisup/chain/tuning/run_tdnn_100k.sh \ --train-set train_sup \ diff --git a/egs/fisher_english/s5/local/semisup/run_10k.sh b/egs/fisher_english/s5/local/semisup/run_10k.sh index a5a293f3ce2..b91c67cb711 100644 --- a/egs/fisher_english/s5/local/semisup/run_10k.sh +++ b/egs/fisher_english/s5/local/semisup/run_10k.sh @@ -13,7 +13,14 @@ train_stage=-10 set -o pipefail exp=exp/semisup_11k -false && { + +for f in data/train_sup/utt2spk data/train_unsup250k/utt2spk ]; do + if [ ! 
-f $f ]; then + echo "$0: Could not find $f" + exit 1 + fi +done + utils/subset_data_dir.sh --speakers data/train_sup 11000 data/train_sup11k || exit 1 utils/subset_data_dir.sh --shortest data/train_sup11k 5000 data/train_sup11k_short || exit 1 utils/subset_data_dir.sh data/train_sup11k 5500 data/train_sup11k_half || exit 1 @@ -54,7 +61,23 @@ steps/train_sat.sh --cmd "$train_cmd" \ )& utils/combine_data.sh data/semisup11k_250k data/train_sup11k data/train_unsup250k || exit 1 -} + +mkdir -p data/local/pocolm_ex250k + +utils/filter_scp.pl --exclude data/train_unsup250k/utt2spk \ + data/train/text > data/local/pocolm_ex250k/text.tmp + +local/fisher_train_lms_pocolm.sh \ + --text data/local/pocolm_ex250k/text.tmp \ + --dir data/local/pocolm_ex250k + +local/fisher_create_test_lang.sh \ + --arpa-lm data/local/pocolm_ex250k/data/arpa/4gram_small.arpa.gz \ + --dir data/lang_test_poco_ex250k + +utils/build_const_arpa_lm.sh \ + data/local/pocolm_ex250k/data/arpa/4gram_big.arpa.gz \ + data/lang_test_poco_ex250k data/lang_test_poco_ex250k_big local/semisup/chain/tuning/run_tdnn_11k.sh \ --ivector-train-set semisup11k_250k --train-set train_sup11k --stage $stage --train-stage $train_stage || exit 1 From 5e7ca16b5bd3e17894995e2373538026adb6054b Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 24 Jan 2018 12:18:28 -0500 Subject: [PATCH 115/174] semisup-smbr: Choose egs script inside train.py --- .../nnet3/train/chain_objf/acoustic_model.py | 6 ++- egs/wsj/s5/steps/nnet3/chain/train.py | 10 ++++- src/chainbin/chain-est-phone-lm.cc | 39 +++++-------------- 3 files changed, 21 insertions(+), 34 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 97bd20b9ffd..8e38efef345 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -69,14 +69,15 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, alignment_subsampling_factor=3, online_ivector_dir=None, frames_per_iter=20000, frames_per_eg_str="20", srand=0, - egs_opts=None, cmvn_opts=None, transform_dir=None): + egs_opts=None, cmvn_opts=None, transform_dir=None, + get_egs_script="steps/nnet3/chain/get_egs.sh"): """Wrapper for steps/nnet3/chain/get_egs.sh See options in that script. 
""" common_lib.execute_command( - """steps/nnet3/chain/get_egs.sh {egs_opts} \ + """{get_egs_script} {egs_opts} \ --cmd "{command}" \ --cmvn-opts "{cmvn_opts}" \ --transform-dir "{transform_dir}" \ @@ -94,6 +95,7 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, --frames-per-eg {frames_per_eg_str} \ --srand {srand} \ {data} {dir} {lat_dir} {egs_dir}""".format( + get_egs_script=get_egs_script, command=run_opts.command, cmvn_opts=cmvn_opts if cmvn_opts is not None else '', transform_dir=(transform_dir diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 6a0d772deb3..5981cdd846a 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -81,6 +81,9 @@ def get_args(): parser.add_argument("--chain.leaky-hmm-coefficient", type=float, dest='leaky_hmm_coefficient', default=0.00001, help="") + parser.add_argument("--chain.smbr-leaky-hmm-coefficient", type=float, + dest='smbr_leaky_hmm_coefficient', default=0.00001, + help="") parser.add_argument("--chain.apply-deriv-weights", type=str, dest='apply_deriv_weights', default=True, action=common_lib.StrToBoolAction, @@ -435,7 +438,8 @@ def train(args, run_opts): online_ivector_dir=args.online_ivector_dir, frames_per_iter=args.frames_per_iter, transform_dir=args.transform_dir, - stage=args.egs_stage) + stage=args.egs_stage, + get_egs_script=args.get_egs_script) if args.egs_dir is None: egs_dir = default_egs_dir @@ -614,7 +618,9 @@ def train(args, run_opts): max_deriv_time_relative=max_deriv_time_relative, l2_regularize=l2_regularize, xent_regularize=xent_regularize, - leaky_hmm_coefficient=args.leaky_hmm_coefficient, + leaky_hmm_coefficient=(args.smbr_leaky_hmm_coefficient + if smbr_factor > 0.0 + else args.leaky_hmm_coefficient), momentum=args.momentum, max_param_change=args.max_param_change, shuffle_buffer_size=args.shuffle_buffer_size, diff --git a/src/chainbin/chain-est-phone-lm.cc b/src/chainbin/chain-est-phone-lm.cc index db16cc4d51a..f16b3f4f14b 100644 --- a/src/chainbin/chain-est-phone-lm.cc +++ b/src/chainbin/chain-est-phone-lm.cc @@ -39,52 +39,31 @@ int main(int argc, char *argv[]) { " chain-est-phone-lm --leftmost-context-questions=dir/leftmost_questions.txt ark:- dir/phone_G.fst\n"; bool binary_write = true; - std::string scales_str; - LanguageModelOptions lm_opts; ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); - po.Register("scales", &scales_str, "Comma-separated list of scales " - "for the different sources of phone sequences"); lm_opts.Register(&po); po.Read(argc, argv); - if (po.NumArgs() < 2) { + if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); } - int32 num_sources = po.NumArgs() - 1; - - std::string lm_fst_wxfilename = po.GetArg(po.NumArgs()); - - std::vector scales(num_sources, 1); - if (!scales_str.empty()) { - std::vector parts; - SplitStringToVector(scales_str, ":,", false, &parts); - if (parts.size() != num_sources) { - KALDI_ERR << "--scales must have exactly num-sources = " - << num_sources << " scales."; - } - for (size_t i = 0; i < parts.size(); i++) { - scales[i] = std::atoi(parts[i].c_str()); - } - } + std::string phone_seqs_rspecifier = po.GetArg(1), + lm_fst_wxfilename = po.GetArg(2); + LanguageModelEstimator lm_estimator(lm_opts); - for (int32 n = 1; n <= num_sources; n++) { - std::string phone_seqs_rspecifier = po.GetArg(n); - SequentialInt32VectorReader phones_reader(phone_seqs_rspecifier); - KALDI_LOG << "Reading phone sequences"; - for (; !phones_reader.Done(); phones_reader.Next()) { 
- const std::vector &phone_seq = phones_reader.Value(); - lm_estimator.AddCounts(phone_seq, scales[n-1]); - } + SequentialInt32VectorReader phones_reader(phone_seqs_rspecifier); + KALDI_LOG << "Reading phone sequences"; + for (; !phones_reader.Done(); phones_reader.Next()) { + const std::vector &phone_seq = phones_reader.Value(); + lm_estimator.AddCounts(phone_seq); } - KALDI_LOG << "Estimating phone LM"; fst::StdVectorFst fst; lm_estimator.Estimate(&fst); From 63c47b1d31b37feba5d04691027971a861de30a1 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 24 Jan 2018 12:20:43 -0500 Subject: [PATCH 116/174] semisup-smbr: Update aspire recipe --- .../s5/local/nnet3/run_ivector_common.sh | 49 ++++++------------- 1 file changed, 16 insertions(+), 33 deletions(-) diff --git a/egs/aspire/s5/local/nnet3/run_ivector_common.sh b/egs/aspire/s5/local/nnet3/run_ivector_common.sh index fe8143e657d..9ccff4d84e0 100755 --- a/egs/aspire/s5/local/nnet3/run_ivector_common.sh +++ b/egs/aspire/s5/local/nnet3/run_ivector_common.sh @@ -8,11 +8,8 @@ snrs="20:10:15:5:0" foreground_snrs="20:10:15:5:0" background_snrs="20:10:15:5:0" num_data_reps=3 -db_string="'air' 'rwcp' 'rvb2014'" # RIR dbs to be used in the experiment - # only dbs used for ASpIRE submission system have been used here -RIR_home=db/RIR_databases/ # parent directory of the RIR databases files -download_rirs=true # download the RIR databases from the urls or assume they are present in the RIR_home directory base_rirs="simulated" +prepare_aspire_sets=false set -e . ./cmd.sh @@ -61,20 +58,16 @@ if [ $stage -le 1 ]; then --source-sampling-rate 8000 \ data/${data_dir} data/${data_dir}_rvb done - # create the dev, test and eval sets from the aspire recipe - local/multi_condition/aspire_data_prep.sh - - # copy the alignments for the newly created utterance ids - ali_dirs= - for i in `seq 1 $num_data_reps`; do - local/multi_condition/copy_ali_dir.sh --cmd "$decode_cmd" --utt-prefix "rev${i}_" exp/tri5a exp/tri5a_temp_$i || exit 1; - ali_dirs+=" exp/tri5a_temp_$i" - done - steps/combine_ali_dirs.sh data/train_rvb exp/tri5a_rvb_ali $ali_dirs || exit 1; + if $prepare_aspire_sets; then + # create the dev, test and eval sets from the aspire recipe + local/multi_condition/aspire_data_prep.sh + fi +fi - # copy the alignments for training the 100k system (from tri4a) - local/multi_condition/copy_ali_dir.sh --utt-prefix "rev1_" exp/tri4a exp/tri4a_rvb || exit 1; +aspire_sets= +if $prepare_aspire_sets; then + aspire_sets=dev_aspire fi if [ $stage -le 2 ]; then @@ -84,7 +77,7 @@ if [ $stage -le 2 ]; then utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/aspire-$date/s5/$mfccdir/storage $mfccdir/storage fi - for data_dir in train_rvb dev_rvb test_rvb dev_aspire dev test ; do + for data_dir in train_rvb dev_rvb test_rvb dev test $aspire_sets; do utils/copy_data_dir.sh data/$data_dir data/${data_dir}_hires steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ --cmd "$train_cmd" data/${data_dir}_hires \ @@ -94,32 +87,22 @@ if [ $stage -le 2 ]; then utils/validate_data_dir.sh data/${data_dir}_hires done - # want the 100k subset to exactly match train_100k, since we'll use its alignments. 
- awk -v p='rev1_' '{printf "%s%s\n", p, $1}' data/train_100k/utt2spk > uttlist - #while read line; do grep $line data/train_rvb_hires/utt2spk|head -1; done < uttlist |awk '{print $1}' > uttlist2 - #mv uttlist2 uttlist - utils/subset_data_dir.sh --utt-list uttlist \ - data/train_rvb_hires data/train_rvb_hires_100k - rm uttlist + utils/subset_data_dir.sh data/train_rvb_hires 100000 data/train_rvb_hires_100k utils/subset_data_dir.sh data/train_rvb_hires 30000 data/train_rvb_hires_30k fi if [ $stage -le 3 ]; then - # We need to build a small system just because we need the LDA+MLLT transform - # to train the diag-UBM on top of. We use --num-iters 13 because after we get - # the transform (12th iter is the last), any further training is pointless. - steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ - 5000 10000 data/train_rvb_hires_100k data/lang exp/tri4a_rvb exp/nnet3/tri5a + --max-utts 30000 --subsample 2 \ + data/train_rvb_hires exp/nnet3/pca_transform fi - if [ $stage -le 4 ]; then # To train a diagonal UBM we don't need very much data, so use the smallest - # subset. the input directory exp/nnet3/tri5a is only needed for - # the splice-opts and the LDA transform. + # subset. steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \ - data/train_rvb_hires_30k 512 exp/nnet3/tri5a \ + data/train_rvb_hires_30k 512 exp/nnet3/pca_transform \ exp/nnet3/diag_ubm fi From e8fc4f764b710856fc05c1a4d233391ecebdb9b7 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 24 Jan 2018 13:18:21 -0500 Subject: [PATCH 117/174] Minor fixes --- .../local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh | 5 ++++- egs/wsj/s5/steps/libs/nnet3/report/log_parse.py | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh index 5171890b981..d704894baa6 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh @@ -30,6 +30,8 @@ xent_regularize=0.025 label_delay=5 extra_opts="--chain.mmi-factor-schedule=1.0,1.0@0.1,0.5@0.2,0.5 --chain.smbr-factor-schedule=0.0,0.0@0.1,0.5@0.2,0.5" chain_smbr_extra_opts= +smbr_leaky_hmm_coefficient=0.00001 +leaky_hmm_coefficient=0.1 # decode options extra_left_context=50 @@ -163,7 +165,8 @@ if [ $stage -le 13 ]; then --feat.online-ivector-dir $train_ivector_dir \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient 0.1 \ + --chain.leaky-hmm-coefficient $leaky_hmm_coefficient \ + --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ --chain.l2-regularize 0.0 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index a7f9d678378..65d0bcd14f3 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -387,7 +387,6 @@ def parse_prob_logs(exp_dir, key='accuracy', output="output", field=0): if not train_objf: raise KaldiLogParseException("Could not find any values at field {f} with {k} in " " {l}".format(f=field, k=key, l=train_prob_files)) - " {l}".format(k=key, 
l=train_prob_files)) for line in valid_prob_strings.split('\n'): mat_obj = parse_regex.search(line) From ce3cbab528f14db3bcaa4d15391004a101a0e19b Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 25 Jan 2018 18:20:21 -0500 Subject: [PATCH 118/174] Simplifying recipe --- egs/swbd/s5c/local/run_asr_segmentation.sh | 68 +++++++-------- .../s5c/local/run_cleanup_segmentation.sh | 15 ++-- .../segmentation/combine_targets_dirs.sh | 83 ------------------- .../local/segmentation/copy_targets_dir.sh | 77 ----------------- .../tuning/train_lstm_asr_sad_1a.sh | 3 +- .../tuning/train_stats_asr_sad_1a.sh | 13 +-- .../segmentation/combine_targets_dirs.sh | 55 ++++++++++++ .../s5/steps/segmentation/copy_targets_dir.sh | 46 ++++++++++ .../segmentation/detect_speech_activity.sh | 5 +- .../steps/segmentation/prepare_targets_gmm.sh | 3 +- 10 files changed, 148 insertions(+), 220 deletions(-) delete mode 100755 egs/swbd/s5c/local/segmentation/combine_targets_dirs.sh delete mode 100755 egs/swbd/s5c/local/segmentation/copy_targets_dir.sh create mode 100755 egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh create mode 100755 egs/wsj/s5/steps/segmentation/copy_targets_dir.sh diff --git a/egs/swbd/s5c/local/run_asr_segmentation.sh b/egs/swbd/s5c/local/run_asr_segmentation.sh index 7129e905480..21c20b0a423 100755 --- a/egs/swbd/s5c/local/run_asr_segmentation.sh +++ b/egs/swbd/s5c/local/run_asr_segmentation.sh @@ -10,7 +10,7 @@ lang=data/lang # Must match the one used to train the models lang_test=data/lang_nosp_sw1_tg # Lang directory for decoding. -data_dir=data/train +data_dir=data/train # Model directory used to align the $data_dir to get target labels for training # SAD. This should typically be a speaker-adapted system. sat_model_dir=exp/tri4 @@ -37,7 +37,6 @@ nstage=-10 train_stage=-10 test_stage=-10 num_data_reps=2 -base_rirs=simulated affix=_1a stage=-1 nj=80 @@ -113,6 +112,7 @@ if [ $stage -le 3 ]; then --nj 80 --reco-nj 40 --lang-test $lang_test \ --garbage-phones-list $dir/garbage_phones.txt \ --silence-phones-list $dir/silence_phones.txt \ + --merge-weights $merge_weights \ $lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir fi @@ -124,20 +124,14 @@ if [ $stage -le 4 ]; then fi rvb_opts=() - if [ "$base_rirs" == "simulated" ]; then - # This is the config for the system using simulated RIRs and point-source noises - rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") - rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") - rvb_opts+=(--noise-set-parameters RIRS_NOISES/pointsource_noises/noise_list) - else - # This is the config for the JHU ASpIRE submission system - rvb_opts+=(--rir-set-parameters "1.0, RIRS_NOISES/real_rirs_isotropic_noises/rir_list") - rvb_opts+=(--noise-set-parameters RIRS_NOISES/real_rirs_isotropic_noises/noise_list) - fi + # This is the config for the system using simulated RIRs and point-source noises + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters RIRS_NOISES/pointsource_noises/noise_list) foreground_snrs="20:10:15:5:0" background_snrs="20:10:15:5:0" - num_reps=1 + num_data_reps=1 # corrupt the data to generate multi-condition data # for data_dir in train dev test; do python steps/data/reverberate_data_dir.py \ @@ -148,41 +142,36 @@ if [ $stage -le 4 ]; then --speech-rvb-probability 0.5 \ --pointsource-noise-addition-probability 
0.5 \ --isotropic-noise-addition-probability 0.7 \ - --num-replications $num_reps \ + --num-replications $num_data_reps \ --max-noises-per-minute 4 \ --source-sampling-rate 8000 \ $whole_data_dir $rvb_data_dir + rvb_dirs=() for i in `seq 1 $num_data_reps`; do - local/segmentation/copy_targets_dir.sh --cmd "$decode_cmd" --utt-prefix "rev${i}_" exp/segmentation_1a/train_whole_combined_targets_sub3 exp/segmentation_1a/train_whole_combined_targets_sub3_temp_$i || exit 1; - rvb_dirs+=" exp/segmentation_1a/train_whole_combined_targets_sub3_temp_$i" + steps/segmentation/copy_targets_dir.sh --utt-prefix "rev${i}_" \ + exp/segmentation_1a/train_whole_combined_targets_sub3 \ + exp/segmentation_1a/train_whole_combined_targets_sub3_temp_$i || exit 1; + rvb_dirs+=(exp/segmentation_1a/train_whole_combined_targets_sub3_temp_$i) done - local/segmentation/combine_targets_dirs.sh $rvb_data_dir exp/segmentation_1a/train_whole_combined_targets_sub3_rvb $rvb_dirs || exit 1; - cp exp/segmentation_1a/train_whole_combined_targets_sub3_rvb/targets.scp exp/segmentation_1a/ + steps/segmentation/combine_targets_dirs.sh \ + $rvb_data_dir exp/segmentation_1a/train_whole_combined_targets_sub3_rvb \ + $rvb_dirs || exit 1; fi if [ $stage -le 5 ]; then utils/copy_data_dir.sh ${rvb_data_dir} ${rvb_data_dir}_hires - steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 10 \ + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 80 \ ${rvb_data_dir}_hires steps/compute_cmvn_stats.sh ${rvb_data_dir}_hires fi -# if [ $stage -le 6 ]; then -# # Train a TDNN-LSTM network for SAD -# local/segmentation/tuning/train_lstm_asr_sad_1a.sh \ -# --stage $nstage --train-stage $train_stage \ -# --targets-dir $dir \ -# --data-dir ${rvb_data_dir}_hires -# fi - if [ $stage -le 6 ]; then # Train a STATS-pooling network for SAD - local/segmentation/tuning/train_stats_asr_sad_1a.sh \ --stage $nstage --train-stage $train_stage \ - --targets-dir $dir \ + --targets-dir exp/segmentation_1a/train_whole_combined_targets_sub3_rvb \ --data-dir ${rvb_data_dir}_hires fi @@ -199,30 +188,29 @@ if [ $stage -le 7 ]; then --extra-left-context-initial 0 --extra-right-context-final 0 \ --nj 32 --acwt 0.3 --stage $test_stage \ data/eval2000 \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2 \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a \ mfcc_hires \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/{,eval2000} + exp/segmentation_1a/tdnn_stats_asr_sad_1a/{,eval2000} fi if [ $stage -le 8 ]; then # Do some diagnostics steps/segmentation/evaluate_segmentation.pl data/eval2000/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/segments &> \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/evalutate_segmentation.log + exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/segments &> \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/evalutate_segmentation.log steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/utt2spk \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/sys.rttm + exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/utt2spk \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/segments \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/sys.rttm # export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin # md-eval.pl -c 0.25 -r $eval2000_rttm_file \ -# -s exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/sys.rttm > \ -# 
exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/md_eval.log +# -s exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/sys.rttm > \ +# exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/md_eval.log fi if [ $stage -le 9 ]; then - utils/copy_data_dir.sh exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg \ + utils/copy_data_dir.sh exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg \ data/eval2000.seg_asr_sad_1a fi - diff --git a/egs/swbd/s5c/local/run_cleanup_segmentation.sh b/egs/swbd/s5c/local/run_cleanup_segmentation.sh index 8b08422d277..c879a55d16a 100755 --- a/egs/swbd/s5c/local/run_cleanup_segmentation.sh +++ b/egs/swbd/s5c/local/run_cleanup_segmentation.sh @@ -1,8 +1,8 @@ #!/bin/bash -# 2017 Nagendra Kumar Goel -# 2016 Vimal Manohar -# 2016 Johns Hopkins University (author: Daniel Povey) +# Copyright 2016 Vimal Manohar +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Nagendra Kumar Goel # Apache 2.0 # This script demonstrates how to re-segment training data selecting only the @@ -23,9 +23,9 @@ set -u stage=0 cleanup_stage=0 -data=data/train +data=data/train_nodup cleanup_affix=cleaned -srcdir=exp/tri4_mmi_b0.1 +srcdir=exp/tri4 langdir=data/lang_sw1_tg nj=100 decode_nj=16 @@ -42,7 +42,8 @@ cleaned_dir=${srcdir}_${cleanup_affix} if [ $stage -le 1 ]; then # This does the actual data cleanup. - steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage --nj $nj --cmd "$train_cmd" \ + steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage \ + --nj $nj --cmd "$train_cmd" \ $data $langdir $srcdir $dir $cleaned_data fi @@ -53,5 +54,5 @@ fi if [ $stage -le 3 ]; then steps/train_sat.sh --cmd "$train_cmd" \ - 5000 100000 $cleaned_data $langdir ${srcdir}_ali_${cleanup_affix} ${cleaned_dir} + 11500 200000 $cleaned_data $langdir ${srcdir}_ali_${cleanup_affix} ${cleaned_dir} fi diff --git a/egs/swbd/s5c/local/segmentation/combine_targets_dirs.sh b/egs/swbd/s5c/local/segmentation/combine_targets_dirs.sh deleted file mode 100755 index 48c4ce93db0..00000000000 --- a/egs/swbd/s5c/local/segmentation/combine_targets_dirs.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/bin/bash -# Copyright 2017 Nagendra Kumar Goel -# Apache 2.0. - -# This srcipt operates on targets directories, such as exp/segmentation_1a/train_whole_combined_targets_sub3 -# the output is a new targets dir which has targets from all the input targets dirs - -# Begin configuration section. -cmd=run.pl -extra_files= -num_jobs=4 -# End configuration section. -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; - -if [[ $# -lt 3 ]]; then - echo "Usage: $0 [options] ..." - echo "e.g.: $0 --num-jobs 32 data/train exp/targets_combined exp/targets_1 exp/targets_2" - echo "Options:" - echo " --extra-files # specify addtional files in 'src-targets-dir1' to copy" - echo " --num-jobs # number of jobs used to split the data directory." - echo " Note, files that don't appear in the first source dir will not be added even if they appear in later ones." - echo " Other than alignments, only files from the first src ali dir are copied." - exit 1; -fi - -data=$1; -shift; -dest=$1; -shift; -first_src=$1; - -mkdir -p $dest; -rm $dest/{targets.*.ark,frame_subsampling_factor} 2>/dev/null - -cp $first_src/frame_subsampling_factor $dest 2>/dev/null - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/targets.1.ark ]; then - echo "$0: check if targets (targets.*.ark) are present in $dir." 
- exit 1; - fi -done - -for dir in $*; do - for f in frame_subsampling_factor; do - diff $first_src/$f $dir/$f 1>/dev/null 2>&1 - if [ $? -ne 0 ]; then - echo "$0: Cannot combine alignment directories with different $f files." - fi - done -done - -for f in frame_subsampling_factor $extra_files; do - if [ ! -f $first_src/$f ]; then - echo "combine_targets_dir.sh: no such file $first_src/$f" - exit 1; - fi - cp $first_src/$f $dest/ -done - -src_id=0 -temp_dir=$dest/temp -[ -d $temp_dir ] && rm -r $temp_dir; -mkdir -p $temp_dir -echo "$0: dumping targets in each source directory as single archive and index." -for dir in $*; do - src_id=$((src_id + 1)) - cur_num_jobs=$(ls $dir/targets.*.ark | wc -l) || exit 1; - tgts=$(for n in $(seq $cur_num_jobs); do echo -n "$dir/targets.$n.ark "; done) - $cmd $dir/log/copy_targets.log \ - copy-matrix "ark:cat $tgts|" \ - ark,scp:$temp_dir/targets.$src_id.ark,$temp_dir/targets.$src_id.scp || exit 1; -done -sort -m $temp_dir/targets.*.scp > $dest/targets.scp || exit 1; - - -echo "Combined targets and stored in $dest" -exit 0 diff --git a/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh b/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh deleted file mode 100755 index 81c9193d22e..00000000000 --- a/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Nagendra Kumar Goel -# 2014 Johns Hopkins University (author: Nagendra K Goel) -# Apache 2.0 - -# This script operates on a directory, such as in exp/segmentation_1a/train_whole_combined_targets_rev1, -# that contains some subset of the following files: -# targets.X.ark -# frame_subsampling_factor -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance names. - - -# begin configuration section -utt_prefix= -utt_suffix= -cmd=run.pl -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --utt-prefix=1- exp/segmentation_1a/train_whole_combined_targets_sub3 exp/segmentation_1a/train_whole_combined_targets_sub3_rev1" - echo "Options" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -src_dir=$1 -dest_dir=$2 - -mkdir -p $dest_dir - -if [ ! -f $src_dir/targets.1.ark ]; then - echo "copy_targets_dir.sh: no such files $src_dir/targets.1.ark" - exit 1; -fi - -for f in frame_subsampling_factor; do - if [ ! -f $src_dir/$f ]; then - echo "$0: no such file $src_dir/$f this might be serious error." - continue - fi - cp $src_dir/$f $dest_dir/ -done - -nj=$(ls $src_dir/targets.*.ark | wc -l) -mkdir -p $dest_dir/temp -cat << EOF > $dest_dir/temp/copy_targets.sh -set -e; -id=\$1 -echo "$src_dir/targets.\$id.ark" -copy-matrix ark:$src_dir/targets.\$id.ark ark,t:- | \ -python -c " -import sys -for line in sys.stdin: - parts = line.split() - if \"[\" not in line: - print line.rstrip() - else: - print '$utt_prefix{0}$utt_suffix {1}'.format(parts[0], ' '.join(parts[1:])) -" | \ - copy-matrix ark,t:- ark:$dest_dir/targets.\$id.ark || exit 1; -set +o pipefail; # unset the pipefail option. 
-EOF -chmod +x $dest_dir/temp/copy_targets.sh -$cmd -v PATH JOB=1:$nj $dest_dir/temp/copy_targets.JOB.log $dest_dir/temp/copy_targets.sh JOB || exit 1; - -echo "$0: copied targets from $src_dir to $dest_dir" diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh index 74697df099f..13318756e43 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh @@ -9,8 +9,6 @@ set -o pipefail set -u -. ./cmd.sh - # At this script level we don't support not running on GPU, as it would be painfully slow. # If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, # --num-threads 16 and --minibatch-size 128. @@ -50,6 +48,7 @@ affix=1a data_dir=exp/segmentation_1a/train_whole_hires_bp targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 +. ./cmd.sh if [ -f ./path.sh ]; then . ./path.sh; fi . ./utils/parse_options.sh diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh index 3254929306f..96009c69374 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh @@ -33,8 +33,8 @@ relu_dim=256 num_epochs=1 initial_effective_lrate=0.0003 final_effective_lrate=0.00003 -num_jobs_initial=1 -num_jobs_final=1 +num_jobs_initial=3 +num_jobs_final=8 remove_egs=true max_param_change=0.2 # Small max-param change for small network @@ -49,7 +49,7 @@ affix=1a2 data_dir=exp/segmentation_1a/train_whole_rvb_hires targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 -. cmd.sh +. ./cmd.sh if [ -f ./path.sh ]; then . ./path.sh; fi . ./utils/parse_options.sh @@ -135,9 +135,10 @@ if [ $stage -le 6 ]; then fi if [ $stage -le 7 ]; then - copy-feats scp:$targets_dir/targets.scp ark:- | \ - matrix-sum-rows ark:- ark:- | vector-sum --binary=false ark:- - | \ - awk '{print " [ "$2" "$3" "$4" ]"}' > $dir/post_output.vec + # Use a subset to compute prior over the output targets + $cmd $dir/log/get_priors.log \ + matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 $targets_dir/targets.scp |" \ + ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1 echo 3 > $dir/frame_subsampling_factor fi diff --git a/egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh b/egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh new file mode 100755 index 00000000000..f6be21e16f3 --- /dev/null +++ b/egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Copyright 2017 Nagendra Kumar Goel +# 2018 Vimal Manohar +# Apache 2.0. + +# This script combines targets directory into a new targets directory +# containing targets from all the input targets directories. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -lt 3 ]; then + echo "Usage: $0 [options] ..." 
+ echo "e.g.: $0 data/train exp/targets_combined exp/targets_1 exp/targets_2" + exit 1; +fi + +export LC_ALL=C + +data=$1; +shift; +dest=$1; +shift; +first_src=$1; + +mkdir -p $dest; +rm -f $dest/{targets.*.ark,frame_subsampling_factor} 2>/dev/null + +frame_subsampling_factor=1 +if [ -f $first_src/frame_subsampling_factor ]; then + cp $first_src/frame_subsampling_factor $dest + frame_subsampling_factor=$(cat $dest/frame_subsampling_factor) +fi + +for d in $*; do + this_frame_subsampling_factor=1 + if [ -f $d/frame_subsampling_factor ]; then + this_frame_subsampling_factor=$(cat $d/frame_subsampling_factor) + fi + + if [ $this_frame_subsampling_factor != $frame_subsampling_factor ]; then + echo "$0: Cannot combine targets directories with different frame-subsampling-factors" 1>&2 + exit 1 + fi + + cat $d/targets.scp +done | sort -k1,1 > $dest/targets.scp || exit 1 + +steps/segmentation/verify_targets_dir.sh $data $dest || exit 1 + +echo "Combined targets and stored in $dest" +exit 0 diff --git a/egs/wsj/s5/steps/segmentation/copy_targets_dir.sh b/egs/wsj/s5/steps/segmentation/copy_targets_dir.sh new file mode 100755 index 00000000000..f15206b1f7d --- /dev/null +++ b/egs/wsj/s5/steps/segmentation/copy_targets_dir.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Copyright 2017 Nagendra Kumar Goel +# 2014 Johns Hopkins University (author: Nagendra K Goel) +# Apache 2.0 + +# This script makes a copy of targets directory (by copying targets.scp), +# possibly adding a specified prefix or a suffix to the utterance names. + +# begin configuration section +utt_prefix= +utt_suffix= +# end configuration section + +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +if [ $# != 2 ]; then + echo "Usage: " + echo " $0 [options] " + echo "e.g.:" + echo " $0 --utt-prefix=1- exp/segmentation_1a/train_whole_combined_targets_sub3 exp/segmentation_1a/train_whole_combined_targets_sub3_rev1" + echo "Options" + echo " --utt-prefix= # Prefix for utterance ids, default empty" + echo " --utt-suffix= # Suffix for utterance ids, default empty" + exit 1; +fi + +export LC_ALL=C + +srcdir=$1 +destdir=$2 + +mkdir -p $destdir + +if [ -f $srcdir/frame_subsampling_factor ]; then + cp $srcdir/frame_subsampling_factor $destdir +fi + +cat $srcdir/targets.scp | awk -v p=$utt_prefix -v s=$utt_suffix \ + '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map + +cat $srcdir/targets.scp | utils/apply_map.pl -f 1 $destdir/utt_map | \ + sort -k1,1 > $destdir/targets.scp + +echo "$0: copied targets from $srcdir to $destdir" diff --git a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh index 9bc8eea675c..60e3df20df2 100755 --- a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh +++ b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh @@ -13,17 +13,16 @@ set -e set -o pipefail set -u -. ./cmd.sh if [ -f ./path.sh ]; then . ./path.sh; fi affix= # Affix for the segmentation nj=32 -cmd=$decode_cmd +cmd=queue.pl stage=-1 # Feature options (Must match training) mfcc_config=conf/mfcc_hires.conf -feat_affix=hires # Affix for the type of feature used +feat_affix= # Affix for the type of feature used convert_data_dir_to_whole=true # If true, the input data directory is # first converted to whole data directory (i.e. 
whole recordings) diff --git a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh index de19cfc6772..f8557a70177 100755 --- a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh +++ b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh @@ -1,7 +1,6 @@ #! /bin/bash # Copyright 2017 Vimal Manohar -# 2017 Nagendra Kumar Goel # Apache 2.0 # This script prepares targets for training neural network for @@ -211,7 +210,7 @@ if [ $stage -le 5 ]; then # the speech / silence decisions, not the exact word sequences. steps/decode.sh --cmd "$decode_cmd --mem 2G" --nj $nj \ --max-active 1000 --beam 10.0 \ - --skip-scoring true \ + --decode-extra-opts "--word-determinize=false" --skip-scoring true \ $graph_dir $uniform_seg_data_dir $decode_dir fi From b43e5dcb87ed7d04725c8e69c82f9e1779ca20a0 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 26 Jan 2018 12:22:05 -0500 Subject: [PATCH 119/174] simplifying stuff --- egs/swbd/s5c/local/run_asr_segmentation.sh | 78 ++++++++++------------ 1 file changed, 37 insertions(+), 41 deletions(-) diff --git a/egs/swbd/s5c/local/run_asr_segmentation.sh b/egs/swbd/s5c/local/run_asr_segmentation.sh index 21c20b0a423..4bc43007aca 100755 --- a/egs/swbd/s5c/local/run_asr_segmentation.sh +++ b/egs/swbd/s5c/local/run_asr_segmentation.sh @@ -10,7 +10,7 @@ lang=data/lang # Must match the one used to train the models lang_test=data/lang_nosp_sw1_tg # Lang directory for decoding. -data_dir=data/train +data_dir=data/train_nodup # Model directory used to align the $data_dir to get target labels for training # SAD. This should typically be a speaker-adapted system. sat_model_dir=exp/tri4 @@ -18,15 +18,8 @@ sat_model_dir=exp/tri4 # get target labels for training SAD. This should typically be a # speaker-independent system like LDA+MLLT system. model_dir=exp/tri3 -graph_dir= # If not provided, a new one will be created using $lang_test - -# Uniform segmentation options for decoding whole recordings. All values are in -# seconds. -max_segment_duration=10 -overlap_duration=2.5 -max_remaining_duration=5 # If the last remaining piece when splitting uniformly - # is smaller than this duration, then the last piece - # is merged with the previous. +graph_dir= # Graph for decoding whole-recording version of $data_dir. + # If not provided, a new one will be created using $lang_test # List of weights on labels obtained from alignment, # labels obtained from decoding and default labels in out-of-segment regions @@ -37,7 +30,7 @@ nstage=-10 train_stage=-10 test_stage=-10 num_data_reps=2 -affix=_1a +affix=_1a # For segmentation stage=-1 nj=80 @@ -77,7 +70,10 @@ if ! cat $dir/garbage_phones.txt $dir/silence_phones.txt | \ fi whole_data_dir=${data_dir}_whole -rvb_data_dir=${whole_data_dir}_rvb +targets_dir=exp/segmentation${affix}/train_whole_combined_targets_sub3 + +rvb_data_dir=${whole_data_dir}_rvb_hires +rvb_targets_dir=${targets_dir}_rvb if [ $stage -le 0 ]; then utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir @@ -112,15 +108,16 @@ if [ $stage -le 3 ]; then --nj 80 --reco-nj 40 --lang-test $lang_test \ --garbage-phones-list $dir/garbage_phones.txt \ --silence-phones-list $dir/silence_phones.txt \ - --merge-weights $merge_weights \ + --merge-weights "$merge_weights" \ + --graph-dir "$graph_dir" \ $lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir fi if [ $stage -le 4 ]; then # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises if [ ! 
-f rirs_noises.zip ]; then - wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip - unzip rirs_noises.zip + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip fi rvb_opts=() @@ -131,7 +128,6 @@ if [ $stage -le 4 ]; then foreground_snrs="20:10:15:5:0" background_snrs="20:10:15:5:0" - num_data_reps=1 # corrupt the data to generate multi-condition data # for data_dir in train dev test; do python steps/data/reverberate_data_dir.py \ @@ -147,70 +143,70 @@ if [ $stage -le 4 ]; then --source-sampling-rate 8000 \ $whole_data_dir $rvb_data_dir - rvb_dirs=() + rvb_targets_dirs=() for i in `seq 1 $num_data_reps`; do steps/segmentation/copy_targets_dir.sh --utt-prefix "rev${i}_" \ - exp/segmentation_1a/train_whole_combined_targets_sub3 \ - exp/segmentation_1a/train_whole_combined_targets_sub3_temp_$i || exit 1; - rvb_dirs+=(exp/segmentation_1a/train_whole_combined_targets_sub3_temp_$i) + $targets_dir ${targets_dir}_temp_$i || exit 1 + rvb_targets_dirs+=(${targets_dir}_temp_$i) done steps/segmentation/combine_targets_dirs.sh \ - $rvb_data_dir exp/segmentation_1a/train_whole_combined_targets_sub3_rvb \ - $rvb_dirs || exit 1; + $rvb_data_dir ${rvb_targets_dir} \ + ${rvb_targets_dirs[@]} || exit 1; + + rm -r ${rvb_targets_dirs[@]} fi if [ $stage -le 5 ]; then - utils/copy_data_dir.sh ${rvb_data_dir} ${rvb_data_dir}_hires steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 80 \ - ${rvb_data_dir}_hires - steps/compute_cmvn_stats.sh ${rvb_data_dir}_hires + ${rvb_data_dir} + steps/compute_cmvn_stats.sh ${rvb_data_dir} fi if [ $stage -le 6 ]; then # Train a STATS-pooling network for SAD - local/segmentation/tuning/train_stats_asr_sad_1a.sh \ + local/segmentation/tuning/train_stats_asr_sad_1a.sh \ --stage $nstage --train-stage $train_stage \ - --targets-dir exp/segmentation_1a/train_whole_combined_targets_sub3_rvb \ - --data-dir ${rvb_data_dir}_hires + --targets-dir ${rvb_targets_dir} \ + --data-dir ${rvb_data_dir} --affix "1a" || exit 1 fi if [ $stage -le 7 ]; then # The options to this script must match the options used in the # nnet training script. - # e.g. extra-left-context is 70, because the model is an LSTM trained with a - # chunk-left-context of 60. + # e.g. extra-left-context is 79, because the model is an stats pooling network + # trained with a chunk-left-context of 79 and chunk-right-context of 21. # Note: frames-per-chunk is 150 even though the model was trained with # chunk-width of 20. This is just for speed. # See the script for details of the options. 
steps/segmentation/detect_speech_activity.sh \ - --extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 \ + --extra-left-context 79 --extra-right-context 21 --frames-per-chunk 150 \ --extra-left-context-initial 0 --extra-right-context-final 0 \ --nj 32 --acwt 0.3 --stage $test_stage \ data/eval2000 \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a \ + exp/segmentation${affix}/tdnn_stats_asr_sad_1a \ mfcc_hires \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a/{,eval2000} + exp/segmentation${affix}/tdnn_stats_asr_sad_1a/{,eval2000} fi if [ $stage -le 8 ]; then # Do some diagnostics steps/segmentation/evaluate_segmentation.pl data/eval2000/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/segments &> \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/evalutate_segmentation.log + exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/segments &> \ + exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/evalutate_segmentation.log steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/utt2spk \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/sys.rttm + exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/utt2spk \ + exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/segments \ + exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/sys.rttm # export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin # md-eval.pl -c 0.25 -r $eval2000_rttm_file \ -# -s exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/sys.rttm > \ -# exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/md_eval.log +# -s exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/sys.rttm > \ +# exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/md_eval.log fi if [ $stage -le 9 ]; then - utils/copy_data_dir.sh exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg \ + utils/copy_data_dir.sh exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg \ data/eval2000.seg_asr_sad_1a fi From d3c11fb8e898af63a39fc656e9a0ecb78a89cf87 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 2 Feb 2018 12:00:54 -0500 Subject: [PATCH 120/174] Modifying the way mmi and smbr are done in lattice-free --- .../s5/steps/libs/nnet3/report/log_parse.py | 79 +++++++++++-------- .../nnet3/train/chain_objf/acoustic_model.py | 3 + .../libs/nnet3/train/dropout_schedule.py | 2 +- egs/wsj/s5/steps/nnet3/chain/train.py | 15 ++++ .../s5/steps/nnet3/report/generate_plots.py | 21 +++-- src/chain/chain-denominator-smbr.cc | 77 ++++++++++++------ src/chain/chain-denominator-smbr.h | 5 +- src/chain/chain-kernels-ansi.h | 4 +- src/chain/chain-smbr-kernels.cu | 22 ++++-- src/chain/chain-supervision.cc | 2 +- src/chain/chain-training.cc | 26 +++++- src/chain/chain-training.h | 9 ++- src/chain/language-model.cc | 11 ++- src/chain/language-model.h | 4 +- src/chainbin/nnet3-chain-combine.cc | 13 +-- 15 files changed, 201 insertions(+), 92 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index 65d0bcd14f3..931e643b84e 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -334,7 +334,8 @@ def get_train_times(exp_dir): return train_times -def parse_prob_logs(exp_dir, key='accuracy', output="output", field=0): +def parse_prob_logs(exp_dir, key='accuracy', output="output", + get_smbr_objf=False): train_prob_files = 
"%s/log/compute_prob_train.*.log" % (exp_dir) valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir) train_prob_strings = common_lib.get_command_stdout( @@ -352,52 +353,60 @@ def parse_prob_logs(exp_dir, key='accuracy', output="output", field=0): # Overall log-probability for 'output' is -0.307255 per frame, over 20000 # frames. - if field == 0: - parse_regex = re.compile( - ".*compute_prob_.*\.([0-9]+).log:LOG " - ".nnet3.*compute-prob.*:PrintTotalStats..:" - "nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for " - "'{output}'.*is ([0-9.\-e]+) .*per frame".format(output=output)) - else: - other_objfs_str = "" - for i in range(field): - other_objfs_str += "[0-9.\-e]+ [+] "; - - logger.info(".*compute_prob_.*\.([0-9]+).log:LOG " - ".nnet3.*compute-prob.*:PrintTotalStats..:" - "nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for " - "'{output}'.*is {other_objfs}([0-9.\-e]+) .*per frame".format( - output=output, other_objfs=other_objfs_str)) - parse_regex = re.compile( - ".*compute_prob_.*\.([0-9]+).log:LOG " - ".nnet3.*compute-prob.*:PrintTotalStats..:" - "nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for " - "'{output}'.*is {other_objfs}([0-9.\-e]+) .*per frame".format( - output=output, other_objfs=other_objfs_str)) + parse_regex = re.compile( + ".*compute_prob_.*\.([0-9]+).log:LOG " + ".nnet3.*compute-prob.*:PrintTotalStats..:" + "nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for " + "'{output}'.*is ([0-9.\-e]+) .*per frame".format(output=output)) + + other_objfs_str = "" + for i in range(2): + other_objfs_str += "[0-9.\-e]+ [+] "; + smbr_parse_regex = re.compile( + ".*compute_prob_.*\.([0-9]+).log:LOG " + ".nnet3.*compute-prob.*:PrintTotalStats..:" + "nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for " + "'{output}'.*is {other_objfs}([0-9.\-e]+) .*per frame".format( + output=output, other_objfs=other_objfs_str)) train_objf = {} valid_objf = {} for line in train_prob_strings.split('\n'): mat_obj = parse_regex.search(line) - if mat_obj is not None: + mmi_mat_obj = smbr_parse_regex.search(line) + + if mmi_mat_obj is not None: # This is SMBR training + groups = (mat_obj.groups() if get_smbr_objf + else mmi_mat_obj.groups()) + elif mat_obj is not None and not get_smbr_objf: # This is normal chain training groups = mat_obj.groups() - if groups[1] == key: - train_objf[int(groups[0])] = groups[2] + else: + continue + + if groups[1] == key: + train_objf[int(groups[0])] = groups[2] if not train_objf: - raise KaldiLogParseException("Could not find any values at field {f} with {k} in " - " {l}".format(f=field, k=key, l=train_prob_files)) + raise KaldiLogParseException("Could not find any values with {k} in " + " {l}".format(k=key, l=train_prob_files)) for line in valid_prob_strings.split('\n'): mat_obj = parse_regex.search(line) - if mat_obj is not None: + mmi_mat_obj = smbr_parse_regex.search(line) + + if mmi_mat_obj is not None: # This is SMBR training + groups = (mat_obj.groups() if get_smbr_objf + else mmi_mat_obj.groups()) + elif mat_obj is not None: # This is normal chain training groups = mat_obj.groups() - if groups[1] == key: - valid_objf[int(groups[0])] = groups[2] + else: + continue + if groups[1] == key: + valid_objf[int(groups[0])] = groups[2] if not valid_objf: - raise KaldiLogParseException("Could not find any values at field {f} with {k} in " - " {l}".format(f=field, k=key, l=valid_prob_files)) + raise KaldiLogParseException("Could not find any values at with {k} in " + " {l}".format(k=key, l=valid_prob_files)) iters = 
list(set(valid_objf.keys()).intersection(train_objf.keys())) if not iters: @@ -410,7 +419,7 @@ def parse_prob_logs(exp_dir, key='accuracy', output="output", field=0): -def generate_acc_logprob_report(exp_dir, key="accuracy", output="output", field=0): +def generate_acc_logprob_report(exp_dir, key="accuracy", output="output", get_smbr_objf=False): try: times = get_train_times(exp_dir) except: @@ -421,7 +430,7 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output", field= report = [] report.append("%Iter\tduration\ttrain_objective\tvalid_objective\tdifference") try: - data = list(parse_prob_logs(exp_dir, key, output, field)) + data = list(parse_prob_logs(exp_dir, key, output, get_smbr_objf)) except: tb = traceback.format_exc() logger.warning("Error getting info from logs, exception was: " + tb) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 390ca2ee474..3fb7fe14e76 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -506,6 +506,9 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, egs_prefix="valid_diagnostic.", use_multitask_egs=use_multitask_egs) + import re + objective_opts = re.sub(r"--mmi-factor=0.0 ", "--mmi-factor=1e-10 ", + objective_opts) common_lib.background_command( """{command} {dir}/log/compute_prob_valid.{iter}.log \ diff --git a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py index f94d95136c4..9d3934873d4 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py @@ -123,7 +123,7 @@ def _parse_dropout_string(dropout_str): dropout_values.reverse() for data_fraction, proportion in dropout_values: assert data_fraction <= 1.0 and data_fraction >= 0.0 - assert proportion <= 1.0 and proportion >= 0.0 + #assert proportion <= 1.0 and proportion >= 0.0 return dropout_values diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index e7a01871548..71a11879977 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -58,6 +58,10 @@ def get_args(): should halve --trainer.samples-per-iter. May be a comma-separated list of alternatives: first width is the 'principal' chunk-width, used preferentially""") + parser.add_argument("--egs.get-egs-script", type=str, + dest='get_egs_script', + default='steps/nnet3/chain/get_egs.sh', + help="Script for creating egs") # chain options parser.add_argument("--chain.lm-opts", type=str, dest='lm_opts', @@ -74,6 +78,14 @@ def get_args(): dest='xent_regularize', default=0.0, help="Weight of regularization function which is the " "cross-entropy cost the outputs.") + parser.add_argument("--chain.norm-regularize", type=str, + dest='norm_regularize', default=False, + action=common_lib.StrToBoolAction, + choices=["true", "false"], + help="""If true, instead of l2-regularization on + output of the network, we use l1-regularization on + exp(output) of the network. 
This tends to make + exp(output) more like probabilities.""") parser.add_argument("--chain.right-tolerance", type=int, dest='right_tolerance', default=5, help="") parser.add_argument("--chain.left-tolerance", type=int, @@ -585,6 +597,9 @@ def train(args, run_opts): objective_opts += " --mmi-factor={0}".format(mmi_factor) + objective_opts += " --norm-regularize={0}".format( + "true" if args.norm_regularize else "false") + percent = num_archives_processed * 100.0 / num_archives_to_process epoch = (num_archives_processed * args.num_epochs / num_archives_to_process) diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index 7d9aad94fcc..ad9e5cf328c 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -158,7 +158,8 @@ def latex_compliant_name(name_string): def generate_acc_logprob_plots(exp_dir, output_dir, plot, key='accuracy', file_basename='accuracy', comparison_dir=None, - start_iter=1, latex_report=None, output_name='output', field=0): + start_iter=1, latex_report=None, output_name='output', + get_smbr_objf=False): assert start_iter >= 1 @@ -171,7 +172,8 @@ def generate_acc_logprob_plots(exp_dir, output_dir, plot, key='accuracy', index = 0 for dir in dirs: [report, times, data] = log_parse.generate_acc_logprob_report(dir, key, - output_name, field) + output_name, + get_smbr_objf=get_smbr_objf) if index == 0: # this is the main experiment directory @@ -183,8 +185,13 @@ def generate_acc_logprob_plots(exp_dir, output_dir, plot, key='accuracy', color_val = g_plot_colors[index] data = np.array(data) if data.shape[0] == 0: - logger.warning("Couldn't find any rows for the" - "accuracy/log-probability plot, not generating it") + logger.warning("Couldn't find any data for the" + "%s plot of output '%s' " + "for %s, " + "not generating it", + "smbr" if get_smbr_objf else key, + output_name, dir) + continue data = data[data[:, 0] >= start_iter, :] plot_handle, = plt.plot(data[:, 0], data[:, 1], color=color_val, @@ -692,15 +699,15 @@ def generate_plots(exp_dir, output_dir, output_names, comparison_dir=None, elif objective_type == "chain-smbr": generate_acc_logprob_plots( exp_dir, output_dir, g_plot, - key='log-probability', file_basename='smbr', + key='log-probability', file_basename='log_probability', comparison_dir=comparison_dir, start_iter=start_iter, latex_report=latex_report, output_name=output_name) generate_acc_logprob_plots( exp_dir, output_dir, g_plot, - key='log-probability', file_basename='log_probability', + key='log-probability', file_basename='smbr', comparison_dir=comparison_dir, start_iter=start_iter, latex_report=latex_report, output_name=output_name, - field=2) + get_smbr_objf=True) else: logger.info("Generating " + objective_type + " objective plots") generate_acc_logprob_plots( diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc index ab865688c91..0e0e895dd8f 100644 --- a/src/chain/chain-denominator-smbr.cc +++ b/src/chain/chain-denominator-smbr.cc @@ -41,6 +41,11 @@ DenominatorSmbrComputation::DenominatorSmbrComputation( std::min(exp_nnet_output_transposed_.NumCols(), static_cast(kMaxDerivTimeSteps) * num_sequences_)), + nnet_output_mmi_deriv_transposed_( + exp_nnet_output_transposed_.NumRows(), + std::min(exp_nnet_output_transposed_.NumCols(), + static_cast(kMaxDerivTimeSteps) * + num_sequences_)), alpha_(frames_per_sequence_ + 1, den_graph_.NumStates() * num_sequences_ + num_sequences_, kUndefined), @@ -374,11 +379,30 @@ 
bool DenominatorSmbrComputation::BackwardSmbr( nnet_output_deriv_transposed_, 0, num_pdfs, 0, chunk_frames * num_sequences_); + CuSubMatrix transposed_mmi_deriv_part( + nnet_output_mmi_deriv_transposed_, + 0, num_pdfs, + 0, chunk_frames * num_sequences_); CuSubMatrix output_deriv_part( *nnet_output_deriv, t * num_sequences_, chunk_frames * num_sequences_, 0, num_pdfs); - output_deriv_part.AddMat(deriv_weight, transposed_deriv_part, kTrans); + output_deriv_part.AddMat(deriv_weight * opts_.smbr_factor, + transposed_deriv_part, kTrans); + output_deriv_part.AddMat(-deriv_weight * opts_.mmi_factor, + transposed_mmi_deriv_part, kTrans); + + if (GetVerboseLevel() >= 2) { + CuVector deriv_sum(num_pdfs); + deriv_sum.AddColSumMat(1.0, transposed_deriv_part, 0.0); + CuVector mmi_deriv_sum(num_pdfs); + mmi_deriv_sum.AddColSumMat(1.0, transposed_mmi_deriv_part, 0.0); + + deriv_sum.Write(KALDI_LOG, false); + mmi_deriv_sum.Write(KALDI_LOG, false); + } + + if (t != 0) transposed_deriv_part.SetZero(); } @@ -436,7 +460,9 @@ void DenominatorSmbrComputation::BetaSmbrDashGeneralFrame(int32 t) { numerator_post(numerator_posteriors_transposed_, 0, num_pdfs, t * num_sequences_, num_sequences_), log_prob_deriv(nnet_output_deriv_transposed_, 0, num_pdfs, - t_wrapped * num_sequences_, num_sequences_); + t_wrapped * num_sequences_, num_sequences_), + log_prob_mmi_deriv(nnet_output_mmi_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_); int32 num_hmm_states = den_graph_.NumStates(), num_sequences = num_sequences_; @@ -459,7 +485,7 @@ void DenominatorSmbrComputation::BetaSmbrDashGeneralFrame(int32 t) { next_beta, next_beta_smbr, this_beta_dash, this_beta_smbr, log_prob_deriv.Data(), log_prob_deriv.Stride(), - opts_.mmi_factor, opts_.smbr_factor); + log_prob_mmi_deriv.Data(), log_prob_mmi_deriv.Stride()); CU_SAFE_CALL(cudaGetLastError()); if (dimGrid.y == num_hmm_states) { break; // this is the normal case. 
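// For intuition about the refactor above: the SMBR statistics and the denominator
// (MMI) occupancies are now kept in separate buffers, and the smbr/mmi factors are
// applied only once when copying into the output derivative. A minimal sketch of the
// equivalent combination, assuming already-computed matrices smbr_deriv and mmi_deriv
// (same dimensions as the nnet output) and the names deriv_weight/opts from above:
CuMatrix<BaseFloat> total_deriv(smbr_deriv);                      // start from the SMBR term
total_deriv.Scale(deriv_weight * opts.smbr_factor);
total_deriv.AddMat(-deriv_weight * opts.mmi_factor, mmi_deriv);   // subtract the scaled MMI occupancies
// This is the same arithmetic as the two AddMat() calls on output_deriv_part above,
// just written without the transpose bookkeeping.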
@@ -481,10 +507,12 @@ void DenominatorSmbrComputation::BetaSmbrDashGeneralFrame(int32 t) { { int32 prob_stride = probs.Stride(), post_stride = numerator_post.Stride(), - deriv_stride = log_prob_deriv.Stride(); + deriv_stride = log_prob_deriv.Stride(), + mmi_deriv_stride = log_prob_mmi_deriv.Stride(); const BaseFloat *prob_data = probs.Data(); const BaseFloat *post_data = numerator_post.Data(); BaseFloat *log_prob_deriv_data = log_prob_deriv.Data(); + BaseFloat *log_prob_mmi_deriv_data = log_prob_mmi_deriv.Data(); for (int32 h = 0; h < num_hmm_states; h++) { for (int32 s = 0; s < num_sequences; s++) { BaseFloat this_alpha_dash_prob = this_alpha_dash[h * num_sequences + s], @@ -508,11 +536,13 @@ void DenominatorSmbrComputation::BetaSmbrDashGeneralFrame(int32 t) { variable_factor = transition_prob * next_beta_j * prob; tot_beta_smbr += (next_beta_smbr_j + post) * variable_factor; tot_variable_factor += variable_factor; - double this_gamma_r = occupation_factor * variable_factor * + BaseFloat occupation_prob = occupation_factor * variable_factor; + double this_gamma_r = occupation_prob * (this_alpha_smbr_i + post + next_beta_smbr_j - tot_smbr_(s)); log_prob_deriv_data[pdf_id * deriv_stride + s] += - opts_.smbr_factor * this_gamma_r - - opts_.mmi_factor * occupation_factor; + this_gamma_r; + log_prob_mmi_deriv_data[pdf_id * mmi_deriv_stride + s] += + occupation_prob; } this_beta_dash[h * num_sequences + s] = tot_variable_factor / inv_arbitrary_scale; @@ -538,9 +568,12 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { CuSubMatrix this_log_prob_deriv( nnet_output_deriv_transposed_, 0, num_pdfs, t_wrapped * num_sequences_, num_sequences_); + CuSubMatrix this_log_prob_mmi_deriv( + nnet_output_mmi_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_); BaseFloat alpha_beta_product = VecVec(this_alpha_dash, this_beta_dash), - this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); + this_log_prob_mmi_deriv_sum = this_log_prob_mmi_deriv.Sum(); if (!ApproxEqual(alpha_beta_product, num_sequences_)) { KALDI_WARN << "On time " << t << ", alpha-beta product " << alpha_beta_product << " != " << num_sequences_ @@ -563,10 +596,10 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { CuVector alpha_beta_vec(this_alpha_dash); alpha_beta_vec.MulElements(this_beta_dash); - + alpha_beta_smbr_vec.MulElements(alpha_beta_vec); - BaseFloat alpha_beta_smbr_sum = alpha_beta_smbr_vec.Sum() + BaseFloat alpha_beta_smbr_sum = alpha_beta_smbr_vec.Sum() / alpha_beta_product * num_sequences_, tot_smbr_sum = tot_smbr_.Sum(); KALDI_ASSERT (alpha_beta_smbr_sum - alpha_beta_smbr_sum == 0.0); @@ -578,18 +611,18 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { << alpha_beta_smbr_sum << " = tot-smbr-sum"; } - //// use higher tolerance, since we are using randomized pruning for the - //// log-prob derivatives. - //if (GetVerboseLevel() > 1 || !ApproxEqual( - // this_log_prob_deriv_sum, -opts_.mmi_factor * num_sequences_, 0.01)) { - // KALDI_WARN << "On time " << t << ", log-prob-deriv sum " - // << this_log_prob_deriv_sum << " != " - // << -opts_.mmi_factor * num_sequences_; - // if (fabs(this_log_prob_deriv_sum + opts_.mmi_factor * num_sequences_) > 2.0) { - // KALDI_WARN << "Excessive error detected, will abandon this minibatch"; - // ok_ = false; - // } - //} + // use higher tolerance, since we are using randomized pruning for the + // log-prob derivatives. 
+ if (GetVerboseLevel() > 1 || !ApproxEqual( + this_log_prob_mmi_deriv_sum, num_sequences_, 0.01)) { + KALDI_WARN << "On time " << t << ", log-prob-mmi-deriv sum " + << this_log_prob_mmi_deriv_sum << " != " + << num_sequences_; + if (fabs(this_log_prob_mmi_deriv_sum - num_sequences_) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } } diff --git a/src/chain/chain-denominator-smbr.h b/src/chain/chain-denominator-smbr.h index cebf63ccd40..b27415f98cc 100644 --- a/src/chain/chain-denominator-smbr.h +++ b/src/chain/chain-denominator-smbr.h @@ -298,9 +298,12 @@ class DenominatorSmbrComputation { // num_sequences + sequence_index). CuMatrix numerator_posteriors_transposed_; - // the derivs w.r.t. the nnet outputs (transposed) + // the smbr derivs w.r.t. the nnet outputs (transposed) CuMatrix nnet_output_deriv_transposed_; + // the mmi derivs w.r.t. the nnet outputs (transposed) + CuMatrix nnet_output_mmi_deriv_transposed_; + // the (temporarily) alpha and (more permanently) alpha-dash probabilities; // dimension is (frames_per_sequence + 1) by (num-hmm-states * num-sequences + // num_sequences). Note, they are not logs. The last 'num_sequences' diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index 3515725cdcb..c772ce10197 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -66,8 +66,8 @@ extern "C" { BaseFloat *this_beta_smbr, BaseFloat *log_prob_deriv, int32_cuda log_prob_deriv_stride, - BaseFloat mmi_factor, - BaseFloat smbr_factor); + BaseFloat *log_prob_mmi_deriv, + int32_cuda log_prob_mmi_deriv_stride); void cuda_chain_smbr_hmm_forward(dim3 Gr, dim3 Bl, const Int32Pair *backward_transitions, diff --git a/src/chain/chain-smbr-kernels.cu b/src/chain/chain-smbr-kernels.cu index a1804149939..04c33aecf18 100644 --- a/src/chain/chain-smbr-kernels.cu +++ b/src/chain/chain-smbr-kernels.cu @@ -217,7 +217,7 @@ static void _cuda_chain_smbr_hmm_backward( const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, BaseFloat *this_beta, BaseFloat *this_beta_smbr, BaseFloat *log_prob_deriv, int32_cuda log_prob_deriv_stride, - BaseFloat mmi_factor, BaseFloat smbr_factor) { + BaseFloat *log_prob_mmi_deriv, int32_cuda log_prob_mmi_deriv_stride) { // 'forward_transitions', indexed by hmm-state, consists of [start, end] // indexes into the 'transition_info' array. This is about the transitions // *out of* this state. 'probs' contains the exponentiated neural net @@ -280,12 +280,16 @@ static void _cuda_chain_smbr_hmm_backward( BaseFloat this_gamma_r0 = occupation_prob0 * (this_alpha_smbr_i + num_post0 + next_beta_smbr_j0 - tot_smbr[s]); atomic_add(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), - smbr_factor * this_gamma_r0 - mmi_factor * occupation_prob0); + this_gamma_r0); + atomic_add(log_prob_mmi_deriv + (pdf_id0 * log_prob_mmi_deriv_stride + s), + occupation_prob0); BaseFloat occupation_prob1 = variable_factor1 * occupation_factor; BaseFloat this_gamma_r1 = occupation_prob1 * (this_alpha_smbr_i + num_post1 + next_beta_smbr_j1 - tot_smbr[s]); atomic_add(log_prob_deriv + (pdf_id1 * log_prob_deriv_stride + s), - smbr_factor * this_gamma_r1 - mmi_factor * occupation_prob1); + this_gamma_r1); + atomic_add(log_prob_mmi_deriv + (pdf_id1 * log_prob_mmi_deriv_stride + s), + occupation_prob1); } if (trans_iter != trans_end) { // mop up the odd transition. 
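// To make the kernel change concrete: per transition, the backward pass now accumulates
// the raw statistics separately instead of the pre-scaled combination
// smbr_factor * gamma_r - mmi_factor * occupation_prob. A scalar sketch of the same
// update, assuming the per-transition quantities (variable_factor, occupation_factor,
// alpha_smbr, num_post, beta_smbr, tot_smbr, pdf_id, s and the strides) have already
// been computed as in the surrounding code:
BaseFloat occupation_prob = variable_factor * occupation_factor;        // denominator occupancy
BaseFloat gamma_r = occupation_prob *
    (alpha_smbr + num_post + beta_smbr - tot_smbr);                     // expected-accuracy statistic
log_prob_deriv[pdf_id * deriv_stride + s]         += gamma_r;           // SMBR buffer
log_prob_mmi_deriv[pdf_id * mmi_deriv_stride + s] += occupation_prob;   // MMI buffer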
@@ -303,7 +307,9 @@ static void _cuda_chain_smbr_hmm_backward( BaseFloat this_gamma_r0 = occupation_prob0 * (this_alpha_smbr_i + num_post0 + next_beta_smbr_j0 - tot_smbr[s]); atomic_add(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), - smbr_factor * this_gamma_r0 - mmi_factor * occupation_prob0); + this_gamma_r0); + atomic_add(log_prob_mmi_deriv + (pdf_id0 * log_prob_mmi_deriv_stride + s), + occupation_prob0); } BaseFloat beta = tot_variable_factor / inv_arbitrary_scale; this_beta[h * num_sequences + s] = beta; @@ -347,12 +353,14 @@ void cuda_chain_smbr_hmm_backward( BaseFloat *this_beta, BaseFloat *this_beta_smbr, BaseFloat *log_prob_deriv, int32_cuda log_prob_deriv_stride, - BaseFloat mmi_factor, BaseFloat smbr_factor) { + BaseFloat *log_prob_mmi_deriv, + int32_cuda log_prob_mmi_deriv_stride) { _cuda_chain_smbr_hmm_backward<<>>( forward_transitions, transitions, num_sequences, num_hmm_states, probs, prob_stride, num_post, post_stride, tot_smbr, this_alpha, this_alpha_smbr, next_beta, next_beta_smbr, - this_beta, this_beta_smbr, log_prob_deriv, - log_prob_deriv_stride, mmi_factor, smbr_factor); + this_beta, this_beta_smbr, + log_prob_deriv, log_prob_deriv_stride, + log_prob_mmi_deriv, log_prob_mmi_deriv_stride); } diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 6eab1059707..ff706190384 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -76,7 +76,7 @@ void ProtoSupervision::Write(std::ostream &os, bool binary) const { void SupervisionOptions::Check() const { KALDI_ASSERT(left_tolerance >= 0 && right_tolerance >= 0 && frame_subsampling_factor > 0 && - left_tolerance + right_tolerance + 1 >= frame_subsampling_factor); + (left_tolerance + right_tolerance + 1 >= frame_subsampling_factor || (left_tolerance == 0 && right_tolerance == 0))); KALDI_ASSERT(lm_scale >= 0.0 && lm_scale < 1.0); diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 84e9c75fe00..e99d32228a2 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -22,6 +22,7 @@ #include "chain/chain-numerator.h" #include "chain/chain-denominator.h" #include "chain/chain-denominator-smbr.h" +#include "hmm/posterior.h" namespace kaldi { namespace chain { @@ -137,6 +138,21 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, // the numerator object, and the logprob too. 
num_logprob_weighted = opts.mmi_factor * numerator.Forward(); numerator.Backward(&num_posteriors); +#if HAVE_CUDA == 1 + if (!CuDevice::Instantiate().Enabled() && GetVerboseLevel() >= 2) { + Posterior post(num_posteriors.NumRows()); + for (int32 i = 0; i < num_posteriors.NumRows(); i++) { + CuSubVector row(num_posteriors, i); + for (int32 j = 0; j < row.Dim(); j++) { + BaseFloat p = row(j); + if (p >= 0.01) { + post[i].push_back(std::make_pair(j, p)); + } + } + } + PosteriorHolder::Write(KALDI_LOG, false, post); + } +#endif if (nnet_output_deriv && opts.mmi_factor != 0.0) { nnet_output_deriv->CopyFromMat(num_posteriors); @@ -229,12 +245,20 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, if (opts.l2_regularize == 0.0) { *l2_term = 0.0; - } else { + } else if (!opts.norm_regularize) { // compute the l2 penalty term and its derivative BaseFloat scale = supervision.weight * opts.l2_regularize; *l2_term = -0.5 * scale * TraceMatMat(nnet_output, nnet_output, kTrans); if (nnet_output_deriv) nnet_output_deriv->AddMat(-1.0 * scale, nnet_output); + } else { + // compute the l2 penalty term and its derivative + BaseFloat scale = supervision.weight * opts.l2_regularize; + CuMatrix exp_nnet_output(nnet_output); + exp_nnet_output.ApplyExp(); + *l2_term = -scale * exp_nnet_output.Sum(); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(-1.0 * scale, exp_nnet_output); } } diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 46ddbfa2228..ebb2d9298f9 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -70,15 +70,22 @@ struct ChainTrainingOptions { BaseFloat mmi_factor; BaseFloat smbr_factor; + bool norm_regularize; + ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), xent_regularize(0.0), use_smbr_objective(false), exclude_silence(false), one_silence_class(false), - mmi_factor(0.0), smbr_factor(1.0) { } + mmi_factor(0.0), smbr_factor(1.0), + norm_regularize(false) { } void Register(OptionsItf *opts) { opts->Register("l2-regularize", &l2_regularize, "l2 regularization " "constant for 'chain' training, applied to the output " "of the neural net."); + opts->Register("norm-regularize", &norm_regularize, + "If true, then use l1 regularization on exponential of the " + "output of the neural net. Tends to make the " + "exp(output) small and more like probabilities."); opts->Register("leaky-hmm-coefficient", &leaky_hmm_coefficient, "Coefficient " "that allows transitions from each HMM state to each other " "HMM state, to ensure gradual forgetting of context (can " diff --git a/src/chain/language-model.cc b/src/chain/language-model.cc index d2bb073d764..c8900726b00 100644 --- a/src/chain/language-model.cc +++ b/src/chain/language-model.cc @@ -26,8 +26,7 @@ namespace kaldi { namespace chain { -void LanguageModelEstimator::AddCounts(const std::vector &sentence, - int32 weight) { +void LanguageModelEstimator::AddCounts(const std::vector &sentence) { KALDI_ASSERT(opts_.ngram_order >= 2 && "--ngram-order must be >= 2"); KALDI_ASSERT(opts_.ngram_order >= opts_.no_prune_ngram_order); int32 order = opts_.ngram_order; @@ -37,23 +36,23 @@ void LanguageModelEstimator::AddCounts(const std::vector &sentence, end = sentence.end(); for (; iter != end; ++iter) { KALDI_ASSERT(*iter != 0); - IncrementCount(history, *iter, weight); + IncrementCount(history, *iter); history.push_back(*iter); if (history.size() >= order) history.erase(history.begin()); } // Probability of end of sentence. 
This will end up getting ignored later, but // it still makes a difference for probability-normalization reasons. - IncrementCount(history, 0, weight); + IncrementCount(history, 0); } void LanguageModelEstimator::IncrementCount(const std::vector &history, - int32 next_phone, int32 weight) { + int32 next_phone) { int32 lm_state_index = FindOrCreateLmStateIndexForHistory(history); if (lm_states_[lm_state_index].tot_count == 0) { num_active_lm_states_++; } - lm_states_[lm_state_index].AddCount(next_phone, weight); + lm_states_[lm_state_index].AddCount(next_phone, 1.0); } void LanguageModelEstimator::SetParentCounts() { diff --git a/src/chain/language-model.h b/src/chain/language-model.h index 123d5ab830f..b2c3f4cd746 100644 --- a/src/chain/language-model.h +++ b/src/chain/language-model.h @@ -91,7 +91,7 @@ class LanguageModelEstimator { // Adds counts for this sentence. Basically does: for each n-gram in the // sentence, count[n-gram] += 1. The only constraint on 'sentence' is that it // should contain no zeros. - void AddCounts(const std::vector &sentence, int32 weight); + void AddCounts(const std::vector &sentence); // Estimates the LM and outputs it as an FST. Note: there is // no concept here of backoff arcs. @@ -188,7 +188,7 @@ class LanguageModelEstimator { // adds the counts for this ngram (called from AddCounts()). inline void IncrementCount(const std::vector &history, - int32 next_phone, int32 weight); + int32 next_phone); // Computes whether backoff should be allowed for this lm_state. (the caller diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index ca0428553c1..00682d48280 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -54,17 +54,18 @@ double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, end = egs.end(); for (; iter != end; ++iter) prob_computer->Compute(*iter); - const ChainObjectiveInfo *objf_info = - prob_computer->GetObjective("output"); - if (objf_info == NULL) + + std::pair pair = prob_computer->GetTotalObjective(); + BaseFloat tot_weight = pair.second; + double tot_objf = pair.first; + + if (tot_weight == 0.0) KALDI_ERR << "Error getting objective info (unsuitable egs?)"; - KALDI_ASSERT(objf_info->tot_weight > 0.0); // inf/nan tot_objf->return -inf objective. - double tot_objf = objf_info->tot_like + objf_info->tot_l2_term; if (!(tot_objf == tot_objf && tot_objf - tot_objf == 0)) return -std::numeric_limits::infinity(); // we prefer to deal with normalized objective functions. 
- return tot_objf / objf_info->tot_weight; + return tot_objf / tot_weight; } } From a1224eeea978174c46dfd4c57d9c8a122dbf4d49 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 5 Feb 2018 00:37:07 -0500 Subject: [PATCH 121/174] Minor bug fix --- egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh b/egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh index f6be21e16f3..8135d089f5b 100755 --- a/egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh +++ b/egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh @@ -49,7 +49,7 @@ for d in $*; do cat $d/targets.scp done | sort -k1,1 > $dest/targets.scp || exit 1 -steps/segmentation/verify_targets_dir.sh $data $dest || exit 1 +steps/segmentation/validate_targets_dir.sh $dest $data || exit 1 echo "Combined targets and stored in $dest" exit 0 From 715f219415ee4e36403f354f8f1b86a0b5327d4d Mon Sep 17 00:00:00 2001 From: Pegita Date: Tue, 13 Feb 2018 18:06:30 -0500 Subject: [PATCH 122/174] added new functions to accept NnetExample in nnet-chain-training.cc. --- src/chain/chain-supervision.cc | 31 ++++++ src/chain/chain-supervision.h | 4 + src/chain/chain-training.cc | 51 ++++++++++ src/chain/chain-training.h | 12 ++- src/chainbin/Makefile | 2 +- src/chainbin/nnet3-chain-get-egs.cc | 4 +- src/latbin/lattice-1best.cc | 6 +- src/latbin/lattice-to-fst.cc | 150 +++++++++++++++++++++++----- src/nnet3/nnet-chain-training.cc | 116 ++++++++++++++++++++- src/nnet3/nnet-chain-training.h | 10 ++ src/nnet3/nnet-example-utils.cc | 50 +++++++++- src/nnet3/nnet-example-utils.h | 19 ++++ src/nnet3bin/nnet3-get-egs.cc | 6 +- 13 files changed, 422 insertions(+), 39 deletions(-) diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index b5597b15667..7d87201dfdd 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -650,6 +650,37 @@ void AppendSupervision(const std::vector &input, } } +bool AddWeightToFst(const fst::StdVectorFst &normalization_fst, + fst::StdVectorFst *supervision_fst) { + // remove epsilons before composing. 'normalization_fst' has noepsilons so + // the composed result will be epsilon free. + fst::StdVectorFst supervision_fst_noeps(*supervision_fst); + fst::RmEpsilon(&supervision_fst_noeps); + if (!TryDeterminizeMinimize(kSupervisionMaxStates, + &supervision_fst_noeps)) + return false; + + // note: by default, 'Compose' will call 'Connect', so if the + // resulting FST is not connected, it will end up empty. + fst::StdVectorFst composed_fst; + fst::Compose(supervision_fst_noeps, normalization_fst, + &composed_fst); + if (composed_fst.NumStates() == 0) + return false; + // projection should not be necessary, as both FSTs are acceptors. + // determinize and minimize to make it as compact as possible. + + if (!TryDeterminizeMinimize(kSupervisionMaxStates, + &composed_fst)) + return false; + *supervision_fst = composed_fst; + // Make sure the states are numbered in increasing order of time. + SortBreadthFirstSearch(supervision_fst); + KALDI_ASSERT(supervision_fst->Properties(fst::kAcceptor, true) == fst::kAcceptor); + KALDI_ASSERT(supervision_fst->Properties(fst::kIEpsilons, true) == 0); + return true; +} + bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, Supervision *supervision) { // remove epsilons before composing. 
'normalization_fst' has noepsilons so diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index a94f68ade90..c54d4770aa0 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -323,6 +323,10 @@ class SupervisionSplitter { bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, Supervision *supervision); + +bool AddWeightToFst(const fst::StdVectorFst &normalization_fst, + fst::StdVectorFst *supervision_fst); + /// Assuming the 'fst' is epsilon-free, connected, and has the property that all /// paths from the start-state are of the same length, output a vector /// containing that length (from the start-state to the current state) to diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 53de69a0e07..40108636da0 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -25,6 +25,57 @@ namespace kaldi { namespace chain { +void ComputeObjfAndDeriv2(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const GeneralMatrix &supervision, + const CuMatrixBase &nnet_output, + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv) { + if (nnet_output_deriv) { + nnet_output_deriv->SetZero(); + nnet_output_deriv->CopyFromMat(supervision.GetFullMatrix()); + if (xent_output_deriv) + xent_output_deriv->CopyFromMat(*nnet_output_deriv); + } else if (xent_output_deriv) { + // this branch will be taken if xent_output_deriv but not + // nnet_output_deriv is set- which could happen if you want to compute the + // cross-entropy objective but not the derivatives. + xent_output_deriv->SetZero(); + xent_output_deriv->CopyFromMat(supervision.GetFullMatrix()); + } + int32 num_sequences = 64, + frames_per_sequence = 150; + BaseFloat sup_weight = 1.0; + DenominatorComputation denominator(opts, den_graph, + num_sequences, + nnet_output); + BaseFloat den_logprob = denominator.Forward(); + bool ok = true; + if (nnet_output_deriv) + ok = denominator.Backward(-sup_weight, nnet_output_deriv); + // we don't consider log-prob w.r.t numerator. + *objf = -sup_weight * den_logprob; + *weight = sup_weight * num_sequences * frames_per_sequence; + + if (!((*objf) - (*objf) == 0) || !ok) { + // inf or NaN detected, or denominator computation returned false. 
+ if (nnet_output_deriv) + nnet_output_deriv->SetZero(); + if (xent_output_deriv) + xent_output_deriv->SetZero(); + BaseFloat default_objf = -10; + KALDI_WARN << "Objective function is " << (*objf) + << " and denominator computation (if done) returned " + << std::boolalpha << ok + << ", setting objective function to " << default_objf + << " per frame."; + *objf = default_objf * *weight; + } +} + void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const Supervision &supervision, diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index e6143d10846..8c276a4854f 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -63,7 +63,7 @@ struct ChainTrainingOptions { ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), xent_regularize(0.0) { } - + void Register(OptionsItf *opts) { opts->Register("l2-regularize", &l2_regularize, "l2 regularization " "constant for 'chain' training, applied to the output " @@ -121,8 +121,16 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, CuMatrixBase *xent_output_deriv = NULL); - +void ComputeObjfAndDeriv2(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const GeneralMatrix &supervision, + const CuMatrixBase &nnet_output, + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv = NULL); } // namespace chain } // namespace kaldi diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile index 096040000eb..2ee87d7ec33 100644 --- a/src/chainbin/Makefile +++ b/src/chainbin/Makefile @@ -7,7 +7,7 @@ LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) BINFILES = chain-est-phone-lm chain-get-supervision chain-make-den-fst \ - nnet3-chain-get-egs nnet3-chain-copy-egs nnet3-chain-merge-egs \ + nnet3-chain-get-egs nnet3-chain-get-egs-post nnet3-chain-copy-egs nnet3-chain-merge-egs \ nnet3-chain-shuffle-egs nnet3-chain-subset-egs \ nnet3-chain-acc-lda-stats nnet3-chain-train nnet3-chain-compute-prob \ nnet3-chain-combine nnet3-chain-normalize-egs diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index c8c251900ec..206921771c8 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -42,6 +42,7 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, const GeneralMatrix &feats, const MatrixBase *ivector_feats, int32 ivector_period, + const Lattice &lattice, const chain::Supervision &supervision, const std::string &utt_id, bool compress, @@ -278,12 +279,13 @@ int main(int argc, char *argv[]) { num_err++; continue; } - + /* if (!ProcessFile(normalization_fst, feats, online_ivector_feats, online_ivector_period, supervision, key, compress, &utt_splitter, &example_writer)) num_err++; + */ } } if (num_err > 0) diff --git a/src/latbin/lattice-1best.cc b/src/latbin/lattice-1best.cc index f6723687790..f325cb3016e 100644 --- a/src/latbin/lattice-1best.cc +++ b/src/latbin/lattice-1best.cc @@ -61,9 +61,9 @@ int main(int argc, char *argv[]) { lats_wspecifier = po.GetArg(2); SequentialCompactLatticeReader clat_reader(lats_rspecifier); - + // Write as compact lattice. 
- CompactLatticeWriter compact_1best_writer(lats_wspecifier); + CompactLatticeWriter compact_1best_writer(lats_wspecifier); int32 n_done = 0, n_err = 0; @@ -77,7 +77,7 @@ int main(int argc, char *argv[]) { CompactLattice best_path; CompactLatticeShortestPath(clat, &best_path); - + if (best_path.Start() == fst::kNoStateId) { KALDI_WARN << "Possibly empty lattice for utterance-id " << key << "(no output)"; diff --git a/src/latbin/lattice-to-fst.cc b/src/latbin/lattice-to-fst.cc index 0d2ac29a99b..19f8bf453c1 100644 --- a/src/latbin/lattice-to-fst.cc +++ b/src/latbin/lattice-to-fst.cc @@ -22,6 +22,50 @@ #include "util/common-utils.h" #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" +#include "hmm/transition-model.h" + +namespace kaldi { + +void ConvertLatticeToPdfLabels( + const TransitionModel &tmodel, + const Lattice &ifst, + fst::StdVectorFst *ofst) { + typedef fst::ArcTpl ArcIn; + typedef fst::StdArc ArcOut; + typedef ArcIn::StateId StateId; + ofst->DeleteStates(); + // The states will be numbered exactly the same as the original FST. + // Add the states to the new FST. + StateId num_states = ifst.NumStates(); + for (StateId s = 0; s < num_states; s++) { + StateId news = ofst->AddState(); + assert(news == s); + } + ofst->SetStart(ifst.Start()); + for (StateId s = 0; s < num_states; s++) { + LatticeWeight final_iweight = ifst.Final(s); + if (final_iweight != LatticeWeight::Zero()) { + fst::TropicalWeight final_oweight; + ConvertLatticeWeight(final_iweight, &final_oweight); + ofst->SetFinal(s, final_oweight); + } + for (fst::ArcIterator iter(ifst, s); + !iter.Done(); + iter.Next()) { + ArcIn arc = iter.Value(); + KALDI_PARANOID_ASSERT(arc.weight != LatticeWeight::Zero()); + ArcOut oarc; + ConvertLatticeWeight(arc.weight, &oarc.weight); + oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; + oarc.olabel = arc.olabel; + oarc.nextstate = arc.nextstate; + ofst->AddArc(s, oarc); + } + } +} + +} + int main(int argc, char *argv[]) { try { @@ -34,20 +78,33 @@ int main(int argc, char *argv[]) { using std::vector; BaseFloat acoustic_scale = 0.0; BaseFloat lm_scale = 0.0; - bool rm_eps = true; - + bool rm_eps = true, read_compact = true, convert_to_pdf_labels = false; + std::string trans_model; + bool project_input = false, project_output = true; + const char *usage = "Turn lattices into normal FSTs, retaining only the word labels\n" "By default, removes all weights and also epsilons (configure with\n" "with --acoustic-scale, --lm-scale and --rm-eps)\n" "Usage: lattice-to-fst [options] lattice-rspecifier fsts-wspecifier\n" " e.g.: lattice-to-fst ark:1.lats ark:1.fsts\n"; - + ParseOptions po(usage); + po.Register("read-compact", &read_compact, "Read compact lattice"); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); po.Register("lm-scale", &lm_scale, "Scaling factor for graph/lm costs"); po.Register("rm-eps", &rm_eps, "Remove epsilons in resulting FSTs (in lazy way; may not remove all)"); - + po.Register("convert-to-pdf-labels", &convert_to_pdf_labels, + "Convert lattice to pdf labels"); + po.Register("trans-model", &trans_model, + "Transition model"); + po.Register("project-input", &project_input, + "Project to input labels (transition-ids); applicable only " + "when --read-compact=false"); + po.Register("project-output", &project_output, + "Project to output labels (transition-ids); applicable only " + "when --read-compact=false"); + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -56,35 +113,74 @@ int main(int argc, char *argv[]) { } vector 
> scale = fst::LatticeScale(lm_scale, acoustic_scale); - + std::string lats_rspecifier = po.GetArg(1), fsts_wspecifier = po.GetArg(2); - - SequentialCompactLatticeReader lattice_reader(lats_rspecifier); + + TransitionModel tmodel; + if (!trans_model.empty()) { + ReadKaldiObject(trans_model, &tmodel); + } + + SequentialCompactLatticeReader compact_lattice_reader; + SequentialLatticeReader lattice_reader; + TableWriter fst_writer(fsts_wspecifier); - + int32 n_done = 0; // there is no failure mode, barring a crash. - for (; !lattice_reader.Done(); lattice_reader.Next()) { - std::string key = lattice_reader.Key(); - CompactLattice clat = lattice_reader.Value(); - lattice_reader.FreeCurrent(); - ScaleLattice(scale, &clat); // typically scales to zero. - RemoveAlignmentsFromCompactLattice(&clat); // remove the alignments... - fst::VectorFst fst; - { - Lattice lat; - ConvertLattice(clat, &lat); // convert to non-compact form.. won't introduce - // extra states because already removed alignments. - ConvertLattice(lat, &fst); // this adds up the (lm,acoustic) costs to get - // the normal (tropical) costs. - Project(&fst, fst::PROJECT_OUTPUT); // Because in the standard Lattice format, - // the words are on the output, and we want the word labels. + if (read_compact) { + SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); + for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { + std::string key = compact_lattice_reader.Key(); + CompactLattice clat = compact_lattice_reader.Value(); + compact_lattice_reader.FreeCurrent(); + ScaleLattice(scale, &clat); // typically scales to zero. + RemoveAlignmentsFromCompactLattice(&clat); // remove the alignments... + fst::VectorFst fst; + { + Lattice lat; + ConvertLattice(clat, &lat); // convert to non-compact form.. won't introduce + // extra states because already removed alignments. + + if (convert_to_pdf_labels) { + ConvertLatticeToPdfLabels(tmodel, lat, &fst); // this adds up the (lm,acoustic) costs to get + // the normal (tropical) costs. + } else { + ConvertLattice(lat, &fst); + } + + Project(&fst, fst::PROJECT_OUTPUT); // Because in the standard compact_lattice format, + // the words are on the output, and we want the word labels. + } + if (rm_eps) RemoveEpsLocal(&fst); + + fst_writer.Write(key, fst); + n_done++; + } + } else { + SequentialLatticeReader lattice_reader(lats_rspecifier); + for (; !lattice_reader.Done(); lattice_reader.Next()) { + std::string key = lattice_reader.Key(); + Lattice lat = lattice_reader.Value(); + lattice_reader.FreeCurrent(); + ScaleLattice(scale, &lat); // typically scales to zero. + fst::VectorFst fst; + if (convert_to_pdf_labels) { + ConvertLatticeToPdfLabels(tmodel, lat, &fst); + } else { + ConvertLattice(lat, &fst); + } + if (project_input) + Project(&fst, fst::PROJECT_INPUT); + else if (project_output) + Project(&fst, fst::PROJECT_OUTPUT); + if (rm_eps) RemoveEpsLocal(&fst); + + fst_writer.Write(key, fst); + n_done++; } - if (rm_eps) RemoveEpsLocal(&fst); - - fst_writer.Write(key, fst); - n_done++; + } KALDI_LOG << "Done converting " << n_done << " lattices to word-level FSTs"; return (n_done != 0 ? 
0 : 1); diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 780a7115a8a..4c799ea96c3 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -57,6 +57,22 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, } } +void NnetChainTrainer::Train(const NnetExample &eg) { + bool need_model_derivative = true; + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + bool use_xent_regularization = (opts_.chain_config.xent_regularize != 0.0); + ComputationRequest request; + GetComputationRequest(*nnet_, eg, need_model_derivative, + nnet_config.store_component_stats, + use_xent_regularization, need_model_derivative, + &request); + const NnetComputation *computation = compiler_.Compile(request); + + // conventional training + TrainInternal(eg, *computation); + + num_minibatches_processed_++; +} void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { bool need_model_derivative = true; @@ -91,6 +107,41 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { num_minibatches_processed_++; } +void NnetChainTrainer::TrainInternal(const NnetExample &eg, + const NnetComputation &computation) { + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + NnetComputer computer(nnet_config.compute_config, computation, + *nnet_, delta_nnet_); + // give the inputs to the computer object + computer.AcceptInputs(*nnet_, eg.io); + computer.Run(); + + this->ProcessOutputs(eg, &computer); + computer.Run(); + + // If relevant, add in the part of the gradient that comes from L2 + // regularization. + ApplyL2Regularization(*nnet_, + GetNumNvalues(eg.io, false) * + nnet_config.l2_regularize_factor, + delta_nnet_); + + // Updates the parameters of nnet + bool success = UpdateNnetWithMaxChange(*delta_nnet_, + nnet_config.max_param_change, 1.0, 1.0 - nnet_config.momentum, nnet_, + &num_max_change_per_component_applied_, &num_max_change_global_applied_); + + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when we use the model with batchnorm test-mode set). + ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + + // Scale delta_nnet + if (success) + ScaleNnet(nnet_config.momentum, delta_nnet_); + else + ScaleNnet(0.0, delta_nnet_); +} + void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, const NnetComputation &computation) { const NnetTrainerOptions &nnet_config = opts_.nnet_config; @@ -170,6 +221,69 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, ScaleNnet(0.0, delta_nnet_); } +void NnetChainTrainer::ProcessOutputs(const NnetExample &eg, + NnetComputer *computer) { + std::vector::const_iterator iter = eg.io.begin(), + end = eg.io.end(); + for (; iter != end; ++iter) { + const NnetIo &io = *iter; + int32 node_index = nnet_->GetNodeIndex(io.name); + KALDI_ASSERT(node_index >= 0); + if (nnet_->IsOutputNode(node_index)) { + const CuMatrixBase &nnet_output = computer->GetOutput(io.name); + CuMatrix nnet_output_deriv(nnet_output.NumRows(), + nnet_output.NumCols(), + kUndefined); + bool use_xent = (opts_.chain_config.xent_regularize != 0.0); + std::string xent_name = io.name + "-xent"; // typically "output-xent". 
+ CuMatrix xent_deriv; + if (use_xent) + xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), + kUndefined); + + BaseFloat tot_objf, tot_l2_term, tot_weight; + + ComputeObjfAndDeriv2(opts_.chain_config, den_graph_, + io.features, + nnet_output, + &tot_objf, &tot_l2_term, &tot_weight, + &nnet_output_deriv, + (use_xent ? &xent_deriv : NULL)); + if (use_xent) { + // this block computes the cross-entropy objective. + const CuMatrixBase &xent_output = computer->GetOutput( + xent_name); + // at this point, xent_deriv is posteriors derived from the numerato + // computation. note, xent_objf has a factor of '.supervision.weight' + CuMatrix cu_post(io.features.GetFullMatrix()); + BaseFloat xent_objf = TraceMatMat(xent_output, cu_post, kTrans); + objf_info_[xent_name].UpdateStats(xent_name, + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, xent_objf); + } + + //if (opts_.apply_deriv_weights && sup.deriv_weights.Dim() != 0) { + if (opts_.apply_deriv_weights) { + CuVector cu_deriv_weights; + nnet_output_deriv.MulRowsVec(cu_deriv_weights); + if (use_xent) + xent_deriv.MulRowsVec(cu_deriv_weights); + } + computer->AcceptInput(io.name, &nnet_output_deriv); + + objf_info_[io.name].UpdateStats(io.name, + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, tot_objf, tot_l2_term); + if (use_xent) { + xent_deriv.Scale(opts_.chain_config.xent_regularize); + computer->AcceptInput(xent_name, &xent_deriv); + } + } + } +} + void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, const NnetChainExample &eg, NnetComputer *computer) { @@ -214,7 +328,7 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, // at this point, xent_deriv is posteriors derived from the numerator // computation. note, xent_objf has a factor of '.supervision.weight' BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans); - objf_info_[xent_name + suffix].UpdateStats(xent_name + suffix, + objf_info_[xent_name].UpdateStats(xent_name, opts_.nnet_config.print_interval, num_minibatches_processed_, tot_weight, xent_objf); diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h index 5bf6a3f6fce..6e9bbe57ef1 100644 --- a/src/nnet3/nnet-chain-training.h +++ b/src/nnet3/nnet-chain-training.h @@ -61,6 +61,9 @@ class NnetChainTrainer { // train on one minibatch. void Train(const NnetChainExample &eg); + // train on one minibatch using NnetExample + void Train(const NnetExample &eg); + // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; @@ -74,6 +77,10 @@ class NnetChainTrainer { void TrainInternal(const NnetChainExample &eg, const NnetComputation &computation); + // The internal function for doing one step of conventional SGD training. + void TrainInternal(const NnetExample &eg, + const NnetComputation &computation); + // The internal function for doing one step of backstitch training. Depending // on whether is_backstitch_step1 is true, It could be either the first // (backward) step, or the second (forward) step of backstitch. 
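// For context, a minimal usage sketch of the NnetExample-based entry point declared
// above; the variable names (opts, den_fst, nnet, examples_rspecifier) are assumed for
// illustration and are not part of this patch:
NnetChainTrainer trainer(opts, den_fst, &nnet);
SequentialNnetExampleReader example_reader(examples_rspecifier);
for (; !example_reader.Done(); example_reader.Next())
  trainer.Train(example_reader.Value());   // dispatches to the new NnetExample overload
bool ok = trainer.PrintTotalStats();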
@@ -84,6 +91,9 @@ class NnetChainTrainer { void ProcessOutputs(bool is_backstitch_step2, const NnetChainExample &eg, NnetComputer *computer); + void ProcessOutputs(const NnetExample &eg, + NnetComputer *computer); + const NnetChainTrainingOptions opts_; chain::DenominatorGraph den_graph_; diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 65df0c891c1..2151e06bbb4 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -198,6 +198,54 @@ void ShiftExampleTimes(int32 t_offset, } } } +void GetComputationRequest(const Nnet &nnet, + const NnetExample &eg, + bool need_model_derivative, + bool store_component_stats, + bool use_xent_regularization, + bool use_xent_derivative, + ComputationRequest *request) { + request->inputs.clear(); + request->inputs.reserve(eg.io.size()); + request->outputs.clear(); + request->outputs.reserve(eg.io.size() * 2); + request->need_model_derivative = need_model_derivative; + request->store_component_stats = store_component_stats; + for (size_t i = 0; i < eg.io.size(); i++) { + const NnetIo &io = eg.io[i]; + const std::string &name = io.name; + int32 node_index = nnet.GetNodeIndex(name); + if (node_index == -1 && + !nnet.IsInputNode(node_index) && !nnet.IsOutputNode(node_index)) + KALDI_ERR << "Nnet example has input or output named '" << name + << "', but no such input or output node is in the network."; + + std::vector &dest = + nnet.IsInputNode(node_index) ? request->inputs : request->outputs; + dest.resize(dest.size() + 1); + IoSpecification &io_spec = dest.back(); + io_spec.name = name; + io_spec.indexes = io.indexes; + io_spec.has_deriv = nnet.IsOutputNode(node_index) && need_model_derivative; + if (use_xent_regularization && nnet.IsOutputNode(node_index)) { + size_t cur_size = request->outputs.size(); + request->outputs.resize(cur_size + 1); + IoSpecification &io_spec = request->outputs[cur_size - 1], + io_spec_xent = request->outputs[cur_size]; + // the IoSpecification for the -xent output is the same + // as for the regular output, except for its name which has + // the -xent suffix (and the has_deriv member may differ). + io_spec_xent = io_spec; + io_spec_xent.name = name + "-xent"; + io_spec_xent.has_deriv = use_xent_derivative; + } + } + // check to see if something went wrong. + if (request->inputs.empty()) + KALDI_ERR << "No inputs in computation request."; + if (request->outputs.empty()) + KALDI_ERR << "No outputs in computation request."; +} void GetComputationRequest(const Nnet &nnet, const NnetExample &eg, @@ -207,7 +255,7 @@ void GetComputationRequest(const Nnet &nnet, request->inputs.clear(); request->inputs.reserve(eg.io.size()); request->outputs.clear(); - request->outputs.reserve(eg.io.size()); + request->outputs.reserve(eg.io.size() * 2); request->need_model_derivative = need_model_derivative; request->store_component_stats = store_component_stats; for (size_t i = 0; i < eg.io.size(); i++) { diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 02620df7485..05f35fb44de 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -64,6 +64,25 @@ void GetComputationRequest(const Nnet &nnet, ComputationRequest *computation_request); +/** This function takes NnetExample and produces a ComputatioRequest. + It assumes you don't want the derivatives w.r.t the input; + + If use_xent_regularization == true, then it assumes that for each output + name (e.g. 
"output" in the eg, there is another output with the same + dimension and with the suffix "-xent" on its name, e.g. named + "output-xent". The derivative w.r.t. the xent objective will only be + supplied to the nnet computation if 'use_xent_derivative' is true (we + propagate back the xent derivative to the model only in training, not in + model-combination in nnet3-chain-combine). +*/ +void GetComputationRequest(const Nnet &nnet, + const NnetExample &eg, + bool need_model_derivative, + bool store_component_stats, + bool use_xent_regularization, + bool use_xent_derivative, + ComputationRequest *computation_request); + // Writes as unsigned char a vector 'vec' that is required to have // values between 0 and 1. void WriteVectorAsChar(std::ostream &os, diff --git a/src/nnet3bin/nnet3-get-egs.cc b/src/nnet3bin/nnet3-get-egs.cc index cec9549541d..de7904a8d6c 100644 --- a/src/nnet3bin/nnet3-get-egs.cc +++ b/src/nnet3bin/nnet3-get-egs.cc @@ -166,7 +166,7 @@ int main(int argc, char *argv[]) { bool compress = true; int32 num_pdfs = -1, length_tolerance = 100, - targets_length_tolerance = 2, + targets_length_tolerance = 2, online_ivector_period = 1; ExampleGenerationConfig eg_config; // controls num-frames, @@ -192,7 +192,7 @@ int main(int argc, char *argv[]) { "--online-ivectors option"); po.Register("length-tolerance", &length_tolerance, "Tolerance for " "difference in num-frames between feat and ivector matrices"); - po.Register("targets-length-tolerance", &targets_length_tolerance, + po.Register("targets-length-tolerance", &targets_length_tolerance, "Tolerance for " "difference in num-frames (after subsampling) between " "feature matrix and posterior"); @@ -260,7 +260,7 @@ int main(int argc, char *argv[]) { } if (!ProcessFile(feats, online_ivector_feats, online_ivector_period, - pdf_post, key, compress, num_pdfs, + pdf_post, key, compress, num_pdfs, targets_length_tolerance, &utt_splitter, &example_writer)) num_err++; From 474e865fdaf542c69a675340cd086fb8dc3040ee Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 20 Feb 2018 18:32:24 -0500 Subject: [PATCH 123/174] LF-SMBR training --- .../nnet3/train/chain_objf/acoustic_model.py | 2 +- .../steps/libs/nnet3/xconfig/basic_layers.py | 93 ++++++++++++------- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 5 + egs/wsj/s5/steps/nnet3/chain/train.py | 18 ++-- src/chain/chain-training.cc | 10 +- src/cudamatrix/cu-kernels-ansi.h | 4 + src/cudamatrix/cu-kernels.cu | 23 +++++ src/cudamatrix/cu-kernels.h | 10 ++ src/cudamatrix/cu-matrix.cc | 28 ++++++ src/cudamatrix/cu-matrix.h | 3 + 10 files changed, 154 insertions(+), 42 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 3fb7fe14e76..3bd0c538951 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -92,7 +92,7 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, --alignment-subsampling-factor {alignment_subsampling_factor} \ --stage {stage} \ --frames-per-iter {frames_per_iter} \ - --frames-per-eg {frames_per_eg_str} \ + --frames-per-eg "{frames_per_eg_str}" \ --srand {srand} \ {data} {dir} {lat_dir} {egs_dir}""".format( get_egs_script=get_egs_script, diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index aafe02cff6c..5f62500a76e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ 
b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -464,6 +464,8 @@ class XconfigOutputLayer(XconfigLayerBase): ng-affine-options='' : Can be used supply non-default options to the affine layer (intended for the natural gradient but can be an arbitrary string to be added to the config line. e.g. 'update-period=2'.). + offset-file='' : If specified, then an offset component replaces the + affine component and the presoftmax-scale-file. """ def __init__(self, first_token, key_to_value, prev_names=None): @@ -491,12 +493,13 @@ def set_default_configs(self): 'bias-stddev': 0.0, 'l2-regularize': 0.0, 'output-delay': 0, - 'ng-affine-options': '' + 'ng-affine-options': '', + 'offset-file': '' } def check_configs(self): - if self.config['dim'] <= -1: + if self.config['offset-file'] != '' and self.config['dim'] <= -1: raise RuntimeError("In output-layer, dim has invalid value {0}" "".format(self.config['dim'])) @@ -513,7 +516,9 @@ def check_configs(self): def auxiliary_outputs(self): - auxiliary_outputs = ['affine'] + auxiliary_outputs = [] + if self.config['offset-file'] == '': + auxiliary_outputs.append('affine') if self.config['include-log-softmax']: auxiliary_outputs.append('log-softmax') @@ -542,6 +547,10 @@ def output_dim(self, auxiliary_output=None): # make sense. raise RuntimeError("Outputs of output-layer may not be used by other" " layers") + + if self.config['offset-file'] != '': + return self.descriptors['input']['dim'] + return self.config['dim'] def get_full_config(self): @@ -554,7 +563,8 @@ def get_full_config(self): # config-files, i.e. it contains the 'final' names of nodes. descriptor_final_string = self.descriptors['input']['final-string'] input_dim = self.descriptors['input']['dim'] - output_dim = self.config['dim'] + output_dim = (self.config['dim'] if self.config['offset-file'] == '' + else input_dim) objective_type = self.config['objective-type'] learning_rate_factor = self.config['learning-rate-factor'] include_log_softmax = self.config['include-log-softmax'] @@ -569,44 +579,63 @@ def get_full_config(self): learning_rate_factor != 1.0 else '') l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize) if l2_regularize != 0.0 else '') + offset_file = self.config['offset-file'] # note: ref.config is used only for getting the left-context and # right-context of the network; # final.config is where we put the actual network definition. for config_name in ['ref', 'final']: # First the affine node. - line = ('component name={0}.affine' - ' type=NaturalGradientAffineComponent' - ' input-dim={1}' - ' output-dim={2}' - ' param-stddev={3}' - ' bias-stddev={4}' - ' max-change={5} {6} {7} {8}' - ''.format(self.name, input_dim, output_dim, - param_stddev, bias_stddev, max_change, ng_affine_options, - learning_rate_option, l2_regularize_option)) - ans.append((config_name, line)) - line = ('component-node name={0}.affine' - ' component={0}.affine input={1}' - ''.format(self.name, descriptor_final_string)) - ans.append((config_name, line)) - cur_node = '{0}.affine'.format(self.name) - - if presoftmax_scale_file is not '' and config_name == 'final': - # don't use the presoftmax-scale in 'ref.config' since that - # file won't exist at the time we evaluate it. - # (ref.config is used to find the left/right context). 
- line = ('component name={0}.fixed-scale' - ' type=FixedScaleComponent scales={1}' - ''.format(self.name, presoftmax_scale_file)) + if self.config['offset-file'] == '': + line = ('component name={0}.affine' + ' type=NaturalGradientAffineComponent' + ' input-dim={1}' + ' output-dim={2}' + ' param-stddev={3}' + ' bias-stddev={4}' + ' max-change={5} {6} {7} {8}' + ''.format(self.name, input_dim, output_dim, + param_stddev, bias_stddev, max_change, ng_affine_options, + learning_rate_option, l2_regularize_option)) + ans.append((config_name, line)) - line = ('component-node name={0}.fixed-scale' - ' component={0}.fixed-scale input={1}' - ''.format(self.name, cur_node)) + line = ('component-node name={0}.affine' + ' component={0}.affine input={1}' + ''.format(self.name, descriptor_final_string)) + ans.append((config_name, line)) + cur_node = '{0}.affine'.format(self.name) + + if presoftmax_scale_file is not '' and config_name == 'final': + # don't use the presoftmax-scale in 'ref.config' since that + # file won't exist at the time we evaluate it. + # (ref.config is used to find the left/right context). + line = ('component name={0}.fixed-scale' + ' type=FixedScaleComponent scales={1}' + ''.format(self.name, presoftmax_scale_file)) + ans.append((config_name, line)) + + line = ('component-node name={0}.fixed-scale' + ' component={0}.fixed-scale input={1}' + ''.format(self.name, cur_node)) + ans.append((config_name, line)) + cur_node = '{0}.fixed-scale'.format(self.name) + else: + line = ('component name={0}.offset' + ' type=PerElementOffsetComponent' + ' vector={1}' + ' max-change={2} {3} {4} {5}' + ''.format(self.name, self.config['offset-file'], + max_change, ng_affine_options, + learning_rate_option, l2_regularize_option)) + ans.append((config_name, line)) + + line = ('component-node name={0}.offset' + ' component={0}.offset input={1}' + ''.format(self.name, descriptor_final_string)) ans.append((config_name, line)) - cur_node = '{0}.fixed-scale'.format(self.name) + cur_node = '{0}.offset'.format(self.name) if include_log_softmax: line = ('component name={0}.log-softmax' diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index e2f9526be34..77ff540149a 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -151,6 +151,11 @@ if $no_chunking; then cut -d ' ' -f 1 $data/utt2spk | \ utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; else + if [ -z "$frames_per_eg" ]; then + echo "$0: --frames-per-eg is expected if --no-chunking is false" + exit 1 + fi + cat $data/utt2dur | \ awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 71a11879977..ff49becd067 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -52,7 +52,7 @@ def get_args(): # egs extraction options parser.add_argument("--egs.chunk-width", type=str, dest='chunk_width', - default="20", + default=None, action=common_lib.NullstrToNoneAction, help="""Number of frames per chunk in the examples used to train the RNN. Caution: if you double this you should halve --trainer.samples-per-iter. 
May be @@ -93,9 +93,6 @@ def get_args(): parser.add_argument("--chain.leaky-hmm-coefficient", type=float, dest='leaky_hmm_coefficient', default=0.00001, help="") - parser.add_argument("--chain.smbr-leaky-hmm-coefficient", type=float, - dest='smbr_leaky_hmm_coefficient', default=0.00001, - help="") parser.add_argument("--chain.apply-deriv-weights", type=str, dest='apply_deriv_weights', default=True, action=common_lib.StrToBoolAction, @@ -137,6 +134,9 @@ def get_args(): parser.add_argument("--chain.smbr-l2-regularize", default=None, dest='smbr_l2_regularize', type=float, help="L2 regularizer term used with sMBR training") + parser.add_argument("--chain.smbr-leaky-hmm-coefficient", type=float, + dest='smbr_leaky_hmm_coefficient', default=None, + help="") # trainer options parser.add_argument("--trainer.input-model", type=str, @@ -236,7 +236,8 @@ def process_args(args): """ Process the options got from get_args() """ - if not common_train_lib.validate_chunk_width(args.chunk_width): + if (args.chunk_width is not None and + not common_train_lib.validate_chunk_width(args.chunk_width)): raise Exception("--egs.chunk-width has an invalid value") if not common_train_lib.validate_minibatch_size_str(args.num_chunk_per_minibatch): @@ -443,7 +444,8 @@ def train(args, run_opts): right_tolerance=args.right_tolerance, frame_subsampling_factor=args.frame_subsampling_factor, alignment_subsampling_factor=args.alignment_subsampling_factor, - frames_per_eg_str=args.chunk_width, + frames_per_eg_str=(args.chunk_width if args.chunk_width is not None + else ""), srand=args.srand, egs_opts=args.egs_opts, cmvn_opts=args.cmvn_opts, @@ -465,7 +467,7 @@ def train(args, run_opts): egs_left_context, egs_right_context, egs_left_context_initial, egs_right_context_final)) - assert(args.chunk_width == frames_per_eg_str) + assert(args.chunk_width is None or args.chunk_width == frames_per_eg_str) num_archives_expanded = num_archives * args.frame_subsampling_factor if (args.num_jobs_final > num_archives_expanded): @@ -634,7 +636,7 @@ def train(args, run_opts): l2_regularize=l2_regularize, xent_regularize=xent_regularize, leaky_hmm_coefficient=(args.smbr_leaky_hmm_coefficient - if smbr_factor > 0.0 + if smbr_factor > 0.0 and args.smbr_leaky_hmm_coefficient is not None else args.leaky_hmm_coefficient), momentum=args.momentum, max_param_change=args.max_param_change, diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index e99d32228a2..a836f4d7342 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -103,12 +103,20 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, if (opts.l2_regularize == 0.0) { *l2_term = 0.0; - } else { + } else if (!opts.norm_regularize) { // compute the l2 penalty term and its derivative BaseFloat scale = supervision.weight * opts.l2_regularize; *l2_term = -0.5 * scale * TraceMatMat(nnet_output, nnet_output, kTrans); if (nnet_output_deriv) nnet_output_deriv->AddMat(-1.0 * scale, nnet_output); + } else { + // compute the l2 penalty term and its derivative + BaseFloat scale = supervision.weight * opts.l2_regularize; + CuMatrix exp_nnet_output(nnet_output); + exp_nnet_output.ApplyExp(); + *l2_term = -scale * exp_nnet_output.Sum(); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(-1.0 * scale, exp_nnet_output); } } diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index a5a88f29f7e..39fcdc9dac8 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -562,6 +562,10 @@ void 
cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d); void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d); +void cudaD_mul_cols_group_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, + MatrixDim d, int group_size); +void cudaF_mul_cols_group_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, + MatrixDim d, int group_size); void cudaD_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride); void cudaF_mul_elements(dim3 Gr, dim3 Bl, float *mat, const float *A, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index b726c8580e2..2ad7c80d30c 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -564,6 +564,19 @@ static void _mul_cols_vec(Real* mat, const Real* scale, MatrixDim d) { mat[index] *= scale[i]; } +template +__global__ +static void _mul_cols_group_vec(Real* mat, const Real* scale, MatrixDim d, + int group_size) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda index = i + j * d.stride; + + if (i < d.cols && j < d.rows) { + mat[index] *= scale[i % group_size]; + } +} + template __global__ static void _mul_rows_vec(Real* mat, const Real* scale, MatrixDim d) { @@ -3780,6 +3793,11 @@ void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, float* mat, const float* scale, _mul_cols_vec<<>>(mat,scale,d); } +void cudaF_mul_cols_group_vec(dim3 Gr, dim3 Bl, float* mat, const float* scale, + MatrixDim d, int group_size) { + _mul_cols_group_vec<<>>(mat,scale,d,group_size); +} + void cudaF_mul_rows_vec(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d) { _mul_rows_vec<<>>(mat,scale,d); @@ -4476,6 +4494,11 @@ void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, double* mat, const double* scale, _mul_cols_vec<<>>(mat,scale,d); } +void cudaD_mul_cols_group_vec(dim3 Gr, dim3 Bl, double* mat, const double* scale, + MatrixDim d, int group_size) { + _mul_cols_group_vec<<>>(mat,scale,d,group_size); +} + void cudaD_mul_rows_vec(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d) { _mul_rows_vec<<>>(mat,scale,d); diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 5e4c832b1e9..4d89360a26d 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -1065,6 +1065,16 @@ inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_cols_vec(Gr, Bl, mat, scale, d); } +inline void cuda_mul_cols_group_vec(dim3 Gr, dim3 Bl, double *mat, + const double *scale, MatrixDim d, + int group_size) { + cudaD_mul_cols_group_vec(Gr, Bl, mat, scale, d, group_size); +} +inline void cuda_mul_cols_group_vec(dim3 Gr, dim3 Bl, float *mat, + const float *scale, MatrixDim d, + int group_size) { + cudaF_mul_cols_group_vec(Gr, Bl, mat, scale, d, group_size); +} inline void cuda_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride) { cudaD_mul_elements(Gr, Bl, mat, A, dst_d, src_stride); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 56d7316c7d8..642a79fb3c1 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -774,6 +774,34 @@ void CuMatrixBase::MulColsVec(const CuVectorBase &scale) { } +template +void CuMatrixBase::MulColsGroupVec(const CuVectorBase &scale) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuTimer tim; + + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), 
NumCols(), + &dimGrid, &dimBlock); + + cuda_mul_cols_group_vec(dimGrid, dimBlock, data_, scale.data_, Dim(), + scale.Dim()); + CU_SAFE_CALL(cudaGetLastError()); + + + CuDevice::Instantiate().AccuProfile(__func__, tim); + } else +#endif + { + int32 num_groups = NumCols() / scale.Dim(); + for (int32 i = 0; i < num_groups; i++) { + CuSubMatrix this_mat(*this, 0, NumRows(), + i * scale.Dim(), scale.Dim()); + this_mat.Mat().MulColsVec(scale.Vec()); + } + } +} + template void CuMatrixBase::MulRowsVec(const CuVectorBase &scale) { diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index cc7f683cdd5..ecd5210de51 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -439,6 +439,9 @@ class CuMatrixBase { void Min(const CuMatrixBase &A); /// scale i'th column by scale[i] void MulColsVec(const CuVectorBase &scale); + /// Divide each row into groups of size scale.Dim() and multiply + /// j^th element in each group of each row by scale[j]. + void MulColsGroupVec(const CuVectorBase &scale); /// scale i'th row by scale[i] void MulRowsVec(const CuVectorBase &scale); /// divide each row into src.NumCols() groups, and then scale i'th row's jth group of elements by src[i, j]. From 91889b157103df2770f78eb8f2e745b0407acf88 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 21 Feb 2018 00:15:53 -0500 Subject: [PATCH 124/174] Aspire changes --- egs/aspire/s5/RESULTS | 2 + .../s5/local/chain/tuning/run_tdnn_lstm_1a.sh | 328 ++++++++++++++++++ .../generate_uniformly_segmented_data_dir.sh | 59 ++-- .../local/multi_condition/aspire_data_prep.sh | 27 +- .../local/multi_condition/run_nnet2_common.sh | 3 +- .../s5/local/multi_condition/run_nnet2_ms.sh | 3 +- egs/aspire/s5/local/nnet3/prep_test_aspire.sh | 19 +- .../s5/local/nnet3/prep_test_aspire_online.sh | 28 +- .../nnet3/prep_test_aspire_segmentation.sh | 170 +++++++++ .../s5/local/nnet3/run_ivector_common.sh | 16 +- egs/aspire/s5/local/run_asr_segmentation.sh | 220 ++++++++++++ .../tuning/train_lstm_asr_sad_1a.sh | 138 ++++++++ .../tuning/train_stats_asr_sad_1a.sh | 136 ++++++++ .../convert_utt2spk_and_segments_to_rttm.py | 2 +- .../segmentation/detect_speech_activity.sh | 2 +- .../internal/get_transform_probs_mat.py | 7 +- .../internal/prepare_sad_graph.py | 3 +- 17 files changed, 1079 insertions(+), 84 deletions(-) create mode 100755 egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh create mode 100755 egs/aspire/s5/local/nnet3/prep_test_aspire_segmentation.sh create mode 100755 egs/aspire/s5/local/run_asr_segmentation.sh create mode 100755 egs/aspire/s5/local/segmentation/tuning/train_lstm_asr_sad_1a.sh create mode 100755 egs/aspire/s5/local/segmentation/tuning/train_stats_asr_sad_1a.sh diff --git a/egs/aspire/s5/RESULTS b/egs/aspire/s5/RESULTS index 3310529a338..b1f18af14cd 100755 --- a/egs/aspire/s5/RESULTS +++ b/egs/aspire/s5/RESULTS @@ -52,3 +52,5 @@ for x in exp/*/decode_dev; do grep WER $x/wer_* | utils/best_wer.sh; done # local/chain/run_blstm_7b.sh %WER 24.6 | 2120 27224 | 82.0 11.5 6.5 6.7 24.6 74.6 | -0.976 | exp/chain/blstm_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v7_iterfinal_pp_fg/score_8/penalty_0.75/ctm.filt.filt.sys +# local/chain/run_tdnn_lstm_1a.sh +%WER 23.6 | 2120 27219 | 82.8 11.5 5.7 6.5 23.6 73.8 | -0.675 | exp/chain/tdnn_lstm_1a/decode_dev_aspire_whole_uniformsegmented_win10_over5_v8_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh new file mode 100755 
index 00000000000..1dc6346b5a2 --- /dev/null +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,328 @@ +#!/bin/bash + +set -e + +# based on run_tdnn_7b.sh in the swbd recipe + +# configs for 'chain' +affix=v8 + +stage=0 +train_stage=-10 +get_egs_stage=-10 +test_stage=1 +nj=70 + +tdnn_affix=_1a +tree_affix=bi_a +chain_affix= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $lat_dir/uttlist.$n.$nj + done + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd JOB=1:$norvb_nj $lat_dir/log/copy_lattices.JOB.log \ + lattice-copy "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done > $lat_dir/lat_rvb.scp + + $train_cmd JOB=1:$nj $lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy \ + "scp:utils/filter_scp.pl data/${train_set}/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $lat_dir/lat.JOB.gz" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree; do + cp $norvb_lat_dir/$f $lat_dir/$f + done +fi + +if [ $stage -le 10 ]; then + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + rm -rf $lang + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ # we build the tree using clean features (data/train) rather than + # the augmented features (data/train_rvb) to get better alignments + + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/train $lang exp/tri5a $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
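+  # For example (illustrative arithmetic only): with the xent_regularize=0.025
+  # default set near the top of this script, learning_rate_factor works out to
+  # 0.5 / 0.025 = 20.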
+ output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! -f exp/nnet3${nnet3_affix}/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi + +if [ $stage -le 16 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/prep_test_aspire.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +if [ $stage -le 17 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + + local/nnet3/prep_test_aspire_online.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --max-count 75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh b/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh index 05f1752cda5..a531e6c8202 100755 --- a/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh +++ b/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh @@ -32,49 +32,50 @@ data_set=$1 if [ "$data_set" == "dev_aspire" ]; then if [ $stage -le 1 ]; then - echo "$0 : Creating the data dir with whole recordings without segmentation" + echo "$0: Creating the data dir with whole recordings without segmentation" # create a whole directory without the segments - unseg_dir=data/${data_set}_whole - src_dir=data/$data_set - mkdir -p $unseg_dir - echo "$0 : Creating the $unseg_dir/wav.scp file" - cp $src_dir/wav.scp $unseg_dir + unseg_dir=data/${data_set}_whole_hires + src_dir=data/${data_set} + utils/data/convert_data_dir_to_whole.sh $src_dir $unseg_dir - echo "$0 : Creating the $unseg_dir/reco2file_and_channel file" + echo "$0: Creating the $unseg_dir/reco2file_and_channel file" cat $unseg_dir/wav.scp | awk '{print $1, $1, "A";}' > $unseg_dir/reco2file_and_channel - cat $unseg_dir/wav.scp | awk '{print $1, $1;}' > $unseg_dir/utt2spk - utils/utt2spk_to_spk2utt.pl $unseg_dir/utt2spk > $unseg_dir/spk2utt - fi data_set=${data_set}_whole +else + utils/copy_data_dir.sh data/$data_set data/${data_set}_hires fi -segmented_data_set=${data_set}_uniformsegmented_win${window}_over${overlap} if [ $stage -le 2 ]; then - echo "$0 : Generating uniform segments with length $window and overlap $overlap." - [ -d data/$segmented_data_set ] && rm -r data/$segmented_data_set - utils/copy_data_dir.sh --validate-opts "--no-text" \ - data/$data_set data/$segmented_data_set - cp data/$data_set/reco2file_and_channel data/$segmented_data_set - - local/multi_condition/create_uniform_segments.py \ - --overlap $overlap --window $window data/$segmented_data_set - - for file in cmvn.scp feats.scp; do - rm -f data/$segmented_data_set/$file - done - utils/validate_data_dir.sh --no-text --no-feats data/$segmented_data_set + echo "$0: Extracting features" + steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ + --mfcc-config conf/mfcc_hires.conf data/${data_set}_hires + + steps/compute_cmvn_stats.sh data/${data_set}_hires + + utils/fix_data_dir.sh data/${data_set}_hires + utils/validate_data_dir.sh --no-text data/${data_set}_hires fi +segmented_data_set=${data_set}_uniformsegmented_win${window}_over${overlap} if [ $stage -le 3 ]; then - echo "$0 : Extracting features for the uniformly segmented dir" + echo "$0 : Generating uniform segments with length $window and overlap $overlap." 
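+  # Illustrative numbers (assuming the window=10 and overlap=5 values that the
+  # prep_test_aspire*.sh scripts pass in): each sub-segment is at most 10 seconds
+  # long and consecutive sub-segments start (window - overlap) = 5 seconds apart.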
[ -d data/${segmented_data_set}_hires ] && rm -r data/${segmented_data_set}_hires - utils/copy_data_dir.sh --validate-opts "--no-text " \ - data/${segmented_data_set} data/${segmented_data_set}_hires + if [ ! -f data/${data_set}_hires/segments ]; then + utils/data/get_segments_for_data.sh data/${data_set}_hires > \ + data/${data_set}_hires/segments + fi - steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ - --mfcc-config conf/mfcc_hires.conf data/${segmented_data_set}_hires + mkdir -p data/${segmented_data_set}_hires + + utils/data/get_uniform_subsegments.py \ + --max-segment-duration=$window \ + --overlap-duration=$overlap \ + --max-remaining-duration=$(perl -e "print $window/ 2.0") \ + data/${data_set}_hires/segments > data/${segmented_data_set}_hires/sub_segments + utils/data/subsegment_data_dir.sh data/${data_set}_hires \ + data/${segmented_data_set}_hires/sub_segments data/${segmented_data_set}_hires steps/compute_cmvn_stats.sh data/${segmented_data_set}_hires utils/fix_data_dir.sh data/${segmented_data_set}_hires diff --git a/egs/aspire/s5/local/multi_condition/aspire_data_prep.sh b/egs/aspire/s5/local/multi_condition/aspire_data_prep.sh index 72f338e25b1..cca9c6cce69 100755 --- a/egs/aspire/s5/local/multi_condition/aspire_data_prep.sh +++ b/egs/aspire/s5/local/multi_condition/aspire_data_prep.sh @@ -3,17 +3,16 @@ # Apache 2.0. set -e stage=0 -aspire_data=/export/corpora5/ASpIRE/ +aspire_data=/export/common/data/corpora/LDC/LDC2017S21 mean_rms=0.0417 # determined from the mean rms value of data/train_rvb/mean_rms . ./path.sh # Needed for KALDI_ROOT . utils/parse_options.sh -dev_transcript=$aspire_data/ASpIRE_single_dev_transcript -dev_audio=$aspire_data/ASpIRE_single_dev -test_audio=$aspire_data/ASpIRE_single_dev_test -eval_audio=$aspire_data/ASpIRE_single_eval -if [ ! -f $aspire_data/glm ]; then +dev_transcript=$aspire_data/dev_and_dev_test_STM_files +dev_audio=$aspire_data/dev_and_dev_test_audio/ASpIRE_single_dev +test_audio=$aspire_data/dev_and_dev_test_audio/ASpIRE_single_dev_test +if [ ! -f $aspire_data/my_english.glm ]; then echo "Expected to find the glm file, provided in ASpIRE challenge." echo "Please provide the glm file in $aspire_data." 
&& exit 1; fi @@ -22,12 +21,11 @@ tmpdir=`pwd`/data/local/data mkdir -p $tmpdir if [ $stage -le 0 ]; then - find $dev_transcript/ -name '*.stm' > $tmpdir/transcripts.flist + find $dev_transcript/ -name 'dev.stm' > $tmpdir/transcripts.flist find $dev_audio/ -name '*.wav' > $tmpdir/wav.flist find $test_audio/ -name '*.wav' > $tmpdir/wav_test.flist - find $eval_audio/ -name '*.wav' > $tmpdir/wav_eval.flist - n=`cat $tmpdir/transcripts.flist | wc -l` + n=`cut -d' ' -f1 $(cat $tmpdir/transcripts.flist) | uniq | wc -l` if [ $n -ne 30 ]; then echo "Expected to find 30 transcript files in the aspire_single_dev_transcript directory, found $n" exit 1; @@ -42,11 +40,6 @@ if [ $stage -le 0 ]; then echo "Expected to find 60 .wav files in the aspire_single_dev_test data, found $n" exit 1; fi - n=`cat $tmpdir/wav_eval.flist | wc -l` - if [ $n -ne 120 ]; then - echo "Expected to find 120 .wav files in the aspire_single_eval data, found $n" - exit 1; - fi fi # create the dev_aspire files @@ -122,12 +115,12 @@ for line in sys.stdin.readlines(): print '{0} sox --vol {1} {2} -r 8000 -t wav - |'.format(file_id, out_rms, line) "| sort -k1,1 -u > $dev/wav.scp || exit 1; cat $dev/wav.scp |awk '{printf("%s %s A\n", $1, $1)}' > $dev/reco2file_and_channel - cp $aspire_data/glm $dev + cp $aspire_data/my_english.glm $dev/glm fi # prepare the eval and test data if [ $stage -le 4 ]; then - for dataset in test eval; do + for dataset in test ; do test=data/${dataset}_aspire mkdir -p $test for f in `cat $tmpdir/wav_${dataset}.flist`; do @@ -153,7 +146,7 @@ for line in lines: cat $test/wav.scp |awk '{printf("%s %s\n", $1, $1)}' > $test/utt2spk cat $test/wav.scp |awk '{printf("%s %s\n", $1, $1)}' > $test/spk2utt cat $test/wav.scp |awk '{printf("%s %s A\n", $1, $1)}' > $test/reco2file_and_channel - cp $aspire_data/glm $test + cp $aspire_data/my_english.glm $test/glm done fi diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh index 2f5a74a2a51..697a89b580e 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh @@ -5,6 +5,7 @@ stage=1 +aspire_data= foreground_snrs="20:10:15:5:0" background_snrs="20:10:15:5:0" num_data_reps=3 @@ -58,7 +59,7 @@ if [ $stage -le 1 ]; then done # create the dev, test and eval sets from the aspire recipe - local/multi_condition/aspire_data_prep.sh + local/multi_condition/aspire_data_prep.sh --aspire-data $aspire_data # copy the alignments for the newly created utterance ids ali_dirs= diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh index 56b2de399f2..73a8cf8e718 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh @@ -12,6 +12,7 @@ stage=1 train_stage=-10 use_gpu=true +aspire_data= dir=exp/nnet2_multicondition/nnet_ms_a set -e @@ -51,7 +52,7 @@ else fi # do the common parts of the script. 
-local/multi_condition/run_nnet2_common.sh --stage $stage +local/multi_condition/run_nnet2_common.sh --stage $stage --aspire-data $aspire_data if [ $stage -le 7 ]; then diff --git a/egs/aspire/s5/local/nnet3/prep_test_aspire.sh b/egs/aspire/s5/local/nnet3/prep_test_aspire.sh index cb69aaff10b..64d24669438 100755 --- a/egs/aspire/s5/local/nnet3/prep_test_aspire.sh +++ b/egs/aspire/s5/local/nnet3/prep_test_aspire.sh @@ -59,23 +59,26 @@ model_affix=`basename $dir` ivector_dir=exp/nnet3 ivector_affix=${affix:+_$affix}_chain_${model_affix}_iter$iter affix=_${affix}_iter${iter} -act_data_set=${data_set} # we will modify the data set, when uniformly segmenting it - # so we will keep track of original data set for the glm and stm files if [ $stage -le 1 ]; then local/generate_uniformly_segmented_data_dir.sh \ --overlap $overlap --window $window $data_set fi -if [ "$data_set" == "test_aspire" ]; then +if [[ "$data_set" =~ "test_aspire" ]]; then out_file=single_dev_test${affix}_$model_affix.ctm -elif [ "$data_set" == "eval_aspire" ]; then + act_data_set=test_aspire +elif [[ "$data_set" =~ "eval_aspire" ]]; then out_file=single_eval${affix}_$model_affix.ctm -elif [ "$data_set" == "dev_aspire" ]; then + act_data_set=eval_aspire +elif [[ "$data_set" =~ "dev_aspire" ]]; then # we will just decode the directory without oracle segments file # as we would like to operate in the actual evaluation condition - data_set=${data_set}_whole out_file=single_dev${affix}_${model_affix}.ctm + act_data_set=dev_aspire +else + echo "$0: Unknown data-set $data_set" + exit 1 fi # uniform segmentation script would have created this dataset @@ -162,6 +165,8 @@ if [ $stage -le 6 ]; then --acwt $acwt --post-decode-acwt $post_decode_acwt \ --extra-left-context $extra_left_context \ --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ --frames-per-chunk "$frames_per_chunk" \ --skip-scoring true --iter $iter --lattice-beam $lattice_beam \ --online-ivector-dir $ivector_dir/ivectors_${segmented_data_set}${ivector_affix} \ @@ -186,8 +191,6 @@ if [ $stage -le 8 ]; then --ctm-beam 6 \ --iter $iter \ --decode-mbr true \ - --window $window \ - --overlap $overlap \ --tune-hyper true \ $lang $decode_dir $act_data_set $segmented_data_set $out_file fi diff --git a/egs/aspire/s5/local/nnet3/prep_test_aspire_online.sh b/egs/aspire/s5/local/nnet3/prep_test_aspire_online.sh index 388eb980839..fce116946e3 100755 --- a/egs/aspire/s5/local/nnet3/prep_test_aspire_online.sh +++ b/egs/aspire/s5/local/nnet3/prep_test_aspire_online.sh @@ -29,6 +29,9 @@ extra_right_context=0 # change for BLSTM frames_per_chunk=50 # change for (B)LSTM acwt=0.1 # important to change this when using chain models post_decode_acwt=1.0 # important to change this when using chain models +extra_left_context_initial=-1 + +score_opts="--min-lmwt 1 --max-lmwt 20" . ./cmd.sh [ -f ./path.sh ] && . 
./path.sh @@ -50,23 +53,26 @@ dir=$4 # exp/nnet3/tdnn model_affix=`basename $dir` affix=_${affix}_iter${iter} -act_data_set=${data_set} # we will modify the data set, when uniformly segmenting it - # so we will keep track of original data set for the glm and stm files if [ $stage -le 1 ]; then local/generate_uniformly_segmented_data_dir.sh \ --overlap $overlap --window $window $data_set fi -if [ "$data_set" == "test_aspire" ]; then +if [[ "$data_set" =~ "test_aspire" ]]; then out_file=single_dev_test${affix}_$model_affix.ctm -elif [ "$data_set" == "eval_aspire" ]; then + act_data_set=test_aspire +elif [[ "$data_set" =~ "eval_aspire" ]]; then out_file=single_eval${affix}_$model_affix.ctm -elif [ "$data_set" == "dev_aspire" ]; then + act_data_set=eval_aspire +elif [[ "$data_set" =~ "dev_aspire" ]]; then # we will just decode the directory without oracle segments file # as we would like to operate in the actual evaluation condition - data_set=${data_set}_whole out_file=single_dev${affix}_${model_affix}.ctm + act_data_set=dev_aspire +else + echo "$0: Unknown data-set $data_set" + exit 1 fi # uniform segmentation script would have created this dataset @@ -89,13 +95,15 @@ if [ $stage -le 3 ]; then # --frames-per-chunk "$frames_per_chunk" #--extra-left-context $extra_left_context \ #--extra-right-context $extra_right_context \ - steps/online/nnet3/decode.sh --cmd "$decode_cmd" \ + steps/online/nnet3/decode.sh --nj $decode_num_jobs --cmd "$decode_cmd" \ --config conf/decode.config $pass2_decode_opts \ --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context-initial $extra_left_context_initial \ --silence-weight $silence_weight \ --per-utt true \ --skip-scoring true --iter $iter --lattice-beam $lattice_beam \ - $graph data/${segmented_data_set}_hires ${decode_dir}_tg + $graph data/${segmented_data_set}_hires ${decode_dir}_tg || \ + { echo "$0: Error decoding" && exit 1; } fi if [ $stage -le 4 ]; then @@ -109,13 +117,11 @@ fi decode_dir=${decode_dir}_fg if [ $stage -le 5 ]; then local/score_aspire.sh --cmd "$decode_cmd" \ - --min-lmwt 1 --max-lmwt 20 \ + $score_opts \ --word-ins-penalties "0.0,0.25,0.5,0.75,1.0" \ --ctm-beam 6 \ --iter $iter \ --decode-mbr true \ - --window $window \ - --overlap $overlap \ --tune-hyper true \ $lang $decode_dir $act_data_set $segmented_data_set $out_file fi diff --git a/egs/aspire/s5/local/nnet3/prep_test_aspire_segmentation.sh b/egs/aspire/s5/local/nnet3/prep_test_aspire_segmentation.sh new file mode 100755 index 00000000000..9f2fbff3205 --- /dev/null +++ b/egs/aspire/s5/local/nnet3/prep_test_aspire_segmentation.sh @@ -0,0 +1,170 @@ +#!/bin/bash + +# Copyright Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) 2016. Apache 2.0. +# This script generates the ctm files for dev_aspire, test_aspire and eval_aspire +# for scoring with ASpIRE scoring server. +# It also provides the WER for dev_aspire data. 
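+#
+# Illustrative usage (this mirrors the call made by local/run_asr_segmentation.sh
+# later in this patch; the positional arguments are
+# <data-set> <sad-nnet-dir> <sad-work-dir> <lang> <graph> <model-dir>):
+# local/nnet3/prep_test_aspire_segmentation.sh --decode-num-jobs 30 --affix 1a \
+#   dev_aspire exp/segmentation_1a/tdnn_lstm_asr_sad_1a exp/segmentation_1a/tdnn_lstm_asr_sad_1a \
+#   data/lang exp/chain/tdnn_lstm_1a/graph_pp exp/chain/tdnn_lstm_1a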
+ +set -e + +# general opts +iter=final +stage=0 +decode_num_jobs=30 +affix= + +# segmentation opts +sad_affix= +sad_opts="--extra-left-context 79 --extra-right-context 21 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0 --acwt 0.3" +sad_graph_opts= +sad_priors_opts= +sad_stage=0 +segment_only=false + +# ivector opts +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=6000 + +# decode opts +decode_opts="--min-active 1000" +lattice_beam=8 +extra_left_context=0 # change for (B)LSTM +extra_right_context=0 # change for BLSTM +frames_per_chunk=50 # change for (B)LSTM +acwt=0.1 # important to change this when using chain models +post_decode_acwt=1.0 # important to change this when using chain models +extra_left_context_initial=-1 +extra_right_context_final=-1 + +score_opts="--min-lmwt 1 --max-lmwt 20" + +. ./cmd.sh +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; + +if [ $# -ne 6 ]; then + echo "Usage: $0 [options] " + echo " Options:" + echo " --stage (0|1|2) # start scoring script from part-way through." + echo "e.g.:" + echo "$0 dev_aspire data/lang exp/tri5a/graph_pp exp/nnet3/tdnn" + exit 1; +fi + +data_set=$1 #select from {dev_aspire, test_aspire, eval_aspire}* +sad_nnet_dir=$2 +sad_work_dir=$3 +lang=$4 # data/lang +graph=$5 #exp/tri5a/graph_pp +dir=$6 # exp/nnet3/tdnn + +model_affix=`basename $dir` +ivector_root_dir=exp/nnet3 +affix=${affix:+_${affix}}${iter:+_iter${iter}} + +if [[ "$data_set" =~ "test_aspire" ]]; then + out_file=single_dev_test${affix}_$model_affix.ctm + act_data_set=test_aspire +elif [[ "$data_set" =~ "eval_aspire" ]]; then + out_file=single_eval${affix}_$model_affix.ctm + act_data_set=eval_aspire +elif [[ "$data_set" =~ "dev_aspire" ]]; then + # we will just decode the directory without oracle segments file + # as we would like to operate in the actual evaluation condition + out_file=single_dev${affix}_${model_affix}.ctm + act_data_set=dev_aspire +else + echo "$0: Unknown data-set $data_set" + exit 1 +fi + +if [ $stage -le 2 ]; then + steps/segmentation/detect_speech_activity.sh \ + --nj $decode_num_jobs --stage $sad_stage \ + --affix "$sad_affix" --graph-opts "$sad_graph_opts" \ + --transform-probs-opts "$sad_priors_opts" $sad_opts \ + data/$data_set $sad_nnet_dir mfcc_hires $sad_work_dir \ + $sad_work_dir/${data_set}${sad_affix:+_$sad_affix} || exit 1 +fi + +segmented_data_set=${data_set}${sad_affix:+_$sad_affix} + +if [ $stage -le 3 ]; then + if [ -f data/$act_data_set/ref.rttm ]; then + if [ ! -f $sad_work_dir/${segmented_data_set}_seg/reco2file_and_channel ]; then + awk '{print $2" "1}' $sad_work_dir/${segmented_data_set}_seg/segments | \ + sort -u > $sad_work_dir/${segmented_data_set}_seg/reco2file_and_channel + fi + + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ + --reco2file-and-channel=${sad_work_dir}/${segmented_data_set}_seg/reco2file_and_channel \ + ${sad_work_dir}/${segmented_data_set}_seg/{utt2spk,segments,sys.rttm} || exit 1 + + export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin + md-eval.pl -c 0.25 -r data/dev_aspire/ref.rttm \ + -s ${sad_work_dir}/${segmented_data_set}_seg/sys.rttm > \ + ${sad_work_dir}/${segmented_data_set}_seg/md_eval.log + fi +fi + +if [ $stage -le 4 ]; then + utils/copy_data_dir.sh $sad_work_dir/${segmented_data_set}_seg \ + data/${segmented_data_set}_hires + steps/compute_cmvn_stats.sh data/${segmented_data_set}_hires + utils/fix_data_dir.sh data/${segmented_data_set}_hires +fi + +if $segment_only; then + echo "$0: --segment-only is true. Exiting." 
+ exit 0 +fi + +if [ $stage -le 5 ]; then + echo "Extracting i-vectors" + # this does offline decoding. + # the --sub-speaker-frames is optional; if provided, it will divide each speaker + # up into "sub-speakers" of at least that many frames... can be useful if + # acoustic conditions drift over time within the speaker's data. + steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $decode_num_jobs \ + --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ + data/${segmented_data_set}_hires $lang $ivector_root_dir/extractor \ + $ivector_root_dir/ivectors_${segmented_data_set} +fi + +decode_dir=$dir/decode_${segmented_data_set}${affix}_pp +if [ $stage -le 6 ]; then + echo "Generating lattices" + rm -f ${decode_dir}_tg/.error + steps/nnet3/decode.sh --nj $decode_num_jobs --cmd "$decode_cmd" --config conf/decode.config \ + --acwt $acwt --post-decode-acwt $post_decode_acwt $decode_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --skip-scoring true --iter $iter --lattice-beam $lattice_beam \ + --online-ivector-dir $ivector_root_dir/ivectors_${segmented_data_set} \ + $graph data/${segmented_data_set}_hires ${decode_dir}_tg || \ + { echo "$0: Error decoding" && exit 1; } +fi + +if [ $stage -le 7 ]; then + echo "Rescoring lattices" + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + --skip-scoring true \ + ${lang}_pp_test{,_fg} data/${segmented_data_set}_hires \ + ${decode_dir}_{tg,fg}; +fi + +decode_dir=${decode_dir}_fg + +if [ $stage -le 8 ]; then + local/score_aspire.sh --cmd "$decode_cmd" \ + $score_opts \ + --word-ins-penalties "0.0,0.25,0.5,0.75,1.0" \ + --ctm-beam 6 \ + --iter $iter \ + --decode-mbr true \ + --tune-hyper true \ + $lang $decode_dir $act_data_set $segmented_data_set $out_file +fi diff --git a/egs/aspire/s5/local/nnet3/run_ivector_common.sh b/egs/aspire/s5/local/nnet3/run_ivector_common.sh index 2df1bf8106c..61930125cdb 100755 --- a/egs/aspire/s5/local/nnet3/run_ivector_common.sh +++ b/egs/aspire/s5/local/nnet3/run_ivector_common.sh @@ -59,16 +59,10 @@ if [ $stage -le 1 ]; then data/${data_dir} data/${data_dir}_rvb done - if $prepare_aspire_sets; then - # create the dev, test and eval sets from the aspire recipe - local/multi_condition/aspire_data_prep.sh - fi + # create the aspire dev, test sets + local/multi_condition/aspire_data_prep.sh fi -aspire_sets= -if $prepare_aspire_sets; then - aspire_sets=dev_aspire -fi if [ $stage -le 2 ]; then mfccdir=mfcc_reverb @@ -77,7 +71,7 @@ if [ $stage -le 2 ]; then utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/mfcc/aspire-$date/s5/$mfccdir/storage $mfccdir/storage fi - for data_dir in train_rvb dev_rvb test_rvb dev test $aspire_sets; do + for data_dir in train_rvb dev_rvb test_rvb dev test dev_aspire; do utils/copy_data_dir.sh data/$data_dir data/${data_dir}_hires steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ --cmd "$train_cmd" data/${data_dir}_hires \ @@ -100,7 +94,7 @@ fi if [ $stage -le 4 ]; then # To train a diagonal UBM we don't need very much data, so use the smallest - # subset. + # subset. 
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \ data/train_rvb_hires_30k 512 exp/nnet3/pca_transform \ exp/nnet3/diag_ubm @@ -116,7 +110,7 @@ if [ $stage -le 5 ]; then fi if [ $stage -le 6 ]; then - ivectordir=exp/nnet3/ivectors_train + ivectordir=exp/nnet3/ivectors_train_rvb if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then # this shows how you can split across multiple file-systems. utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/ivectors/aspire/s5/$ivectordir/storage $ivectordir/storage fi diff --git a/egs/aspire/s5/local/run_asr_segmentation.sh b/egs/aspire/s5/local/run_asr_segmentation.sh new file mode 100755 index 00000000000..e7279ee8ac0 --- /dev/null +++ b/egs/aspire/s5/local/run_asr_segmentation.sh @@ -0,0 +1,220 @@ +#!/bin/bash + +# Copyright 2017 Nagendra Kumar Goel +# 2017 Vimal Manohar +# Apache 2.0 + +# We assume the run.sh has been executed (because we are using model +# directories like exp/tri4) + +lang=data/lang # Must match the one used to train the models +lang_test=data/lang_test # Lang directory for decoding. + +data_dir=data/train_100k +# Model directory used to align the $data_dir to get target labels for training +# SAD. This should typically be a speaker-adapted system. +sat_model_dir=exp/tri4a +# Model direcotry used to decode the whole-recording version of the $data_dir to +# get target labels for training SAD. This should typically be a +# speaker-independent system like LDA+MLLT system. +model_dir=exp/tri3a +graph_dir=exp/tri3a/graph # Graph for decoding whole-recording version of $data_dir. + # If not provided, a new one will be created using $lang_test + +# List of weights on labels obtained from alignment, +# labels obtained from decoding and default labels in out-of-segment regions +merge_weights=1.0,0.1,0.5 + +prepare_targets_stage=-10 +nstage=-10 +train_stage=-10 +test_stage=-10 +num_data_reps=3 +affix=_1a # For segmentation +test_affix=1a +stage=-1 +nj=80 +reco_nj=40 + +# test options +test_nj=30 +test_stage=1 + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi + +set -e -u -o pipefail +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + exit 1 +fi + +dir=exp/segmentation${affix} +mkdir -p $dir + +# See $lang/phones.txt and decide which should be garbage +garbage_phones="laughter oov" +silence_phones="sil noise" + +for p in $garbage_phones; do + for a in "" "_B" "_E" "_I" "_S"; do + echo "$p$a" + done +done > $dir/garbage_phones.txt + +for p in $silence_phones; do + for a in "" "_B" "_E" "_I" "_S"; do + echo "$p$a" + done +done > $dir/silence_phones.txt + +if ! 
cat $dir/garbage_phones.txt $dir/silence_phones.txt | \ + steps/segmentation/internal/verify_phones_list.py $lang/phones.txt; then + echo "$0: Invalid $dir/{silence,garbage}_phones.txt" + exit 1 +fi + +data_id=$(basename $data_dir) +whole_data_dir=${data_dir}_whole +targets_dir=exp/segmentation${affix}/${data_id}_whole_combined_targets_sub3 + +rvb_data_dir=${whole_data_dir}_rvb_hires +rvb_targets_dir=${targets_dir}_rvb + +if [ $stage -le 0 ]; then + utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir +fi + +############################################################################### +# Extract features for the whole data directory +############################################################################### +if [ $stage -le 1 ]; then + steps/make_mfcc.sh --nj $reco_nj --cmd "$train_cmd" --write-utt2num-frames true \ + $whole_data_dir exp/make_mfcc/${data_id}_whole + steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${data_id}_whole + utils/fix_data_dir.sh $whole_data_dir +fi + +############################################################################### +# Get feats for the manual segments +############################################################################### +if [ $stage -le 2 ]; then + if [ ! -f ${data_dir}/segments ]; then + utils/data/get_segments_for_data.sh $data_dir > $data_dir/segments + fi + utils/data/subsegment_data_dir.sh $whole_data_dir ${data_dir}/segments ${data_dir}/tmp + cp $data_dir/tmp/feats.scp $data_dir + + # Use recording as the "speaker". This is required by prepare_targets_gmm.sh script. + awk '{print $1" "$2}' $data_dir/segments > $data_dir/utt2spk + utils/utt2spk_to_spk2utt.pl $data_dir/utt2spk > $data_dir/spk2utt +fi + +if [ $stage -le 3 ]; then + steps/segmentation/prepare_targets_gmm.sh --stage $prepare_targets_stage \ + --train-cmd "$train_cmd" --decode-cmd "$decode_cmd" \ + --nj $nj --reco-nj $reco_nj --lang-test $lang_test \ + --garbage-phones-list $dir/garbage_phones.txt \ + --silence-phones-list $dir/silence_phones.txt \ + --merge-weights "$merge_weights" \ + --graph-dir "$graph_dir" \ + $lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir +fi + +if [ $stage -le 4 ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + if [ ! 
-f rirs_noises.zip ]; then + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + rvb_opts=() + # This is the config for the system using simulated RIRs and point-source noises + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters RIRS_NOISES/pointsource_noises/noise_list) + + foreground_snrs="20:10:15:5:0" + background_snrs="20:10:15:5:0" + # corrupt the data to generate multi-condition data + # for data_dir in train dev test; do + python steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix "rev" \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 0.5 \ + --pointsource-noise-addition-probability 0.5 \ + --isotropic-noise-addition-probability 0.7 \ + --num-replications $num_data_reps \ + --max-noises-per-minute 4 \ + --source-sampling-rate 8000 \ + $whole_data_dir $rvb_data_dir +fi + +if [ $stage -le 5 ]; then + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj \ + ${rvb_data_dir} + steps/compute_cmvn_stats.sh ${rvb_data_dir} + utils/fix_data_dir.sh $rvb_data_dir +fi + +if [ $stage -le 6 ]; then + rvb_targets_dirs=() + for i in `seq 1 $num_data_reps`; do + steps/segmentation/copy_targets_dir.sh --utt-prefix "rev${i}_" \ + $targets_dir ${targets_dir}_temp_$i || exit 1 + rvb_targets_dirs+=(${targets_dir}_temp_$i) + done + + steps/segmentation/combine_targets_dirs.sh \ + $rvb_data_dir ${rvb_targets_dir} \ + ${rvb_targets_dirs[@]} || exit 1; + + rm -r ${rvb_targets_dirs[@]} +fi + +#sad_nnet=exp/segmentation${affix}/tdnn_stats_asr_sad_1a +sad_nnet_dir=exp/segmentation${affix}/tdnn_lstm_asr_sad_1a + +if [ $stage -le 7 ]; then + # # Train a STATS-pooling network for SAD + # local/segmentation/tuning/train_stats_asr_sad_1a.sh \ + # --stage $nstage --train-stage $train_stage \ + # --targets-dir ${rvb_targets_dir} \ + # --data-dir ${rvb_data_dir} --affix "1a" || exit 1 + + # Train a TDNN+LSTM network for SAD + local/segmentation/tuning/train_lstm_asr_sad_1a.sh \ + --stage $nstage --train-stage $train_stage \ + --targets-dir ${rvb_targets_dir} \ + --data-dir ${rvb_data_dir} --affix "1a" || exit 1 +fi + +if [ ! 
-f data/dev_aspire/wav.scp ]; then + echo "$0: Not evaluating on data/dev_aspire" + exit 0 +fi + +if [ $stage -le 8 ]; then +steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ + --reco2file-and-channel=data/dev_aspire/reco2file_and_channel \ + data/dev_aspire/{utt2spk,segments,ref.rttm} +fi + +chain_dir=exp/chain/tdnn_lstm_1a + +if [ $stage -le 9 ]; then + local/nnet3/prep_test_aspire_segmentation.sh --stage $test_stage \ + --decode-num-jobs $test_nj --affix "${test_affix}" \ + --sad-opts "--extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0 --acwt 0.3" \ + --sad-graph-opts "--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0" --sad-priors-opts "--sil-scale=0.1" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 50 \ + --extra-right-context 0 \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --sub-speaker-frames 6000 --max-count 75 \ + --decode-opts "--min-active 1000" \ + dev_aspire $sad_nnet_dir $sad_nnet_dir data/lang $chain_dir/graph_pp $chain_dir +fi diff --git a/egs/aspire/s5/local/segmentation/tuning/train_lstm_asr_sad_1a.sh b/egs/aspire/s5/local/segmentation/tuning/train_lstm_asr_sad_1a.sh new file mode 100755 index 00000000000..438cd1f1d5e --- /dev/null +++ b/egs/aspire/s5/local/segmentation/tuning/train_lstm_asr_sad_1a.sh @@ -0,0 +1,138 @@ +#!/bin/bash + +# Copyright 2017 Nagendra Kumar Goel +# 2018 Vimal Manohar +# Apache 2.0 + +# This is a script to train a TDNN for speech activity detection (SAD) +# using LSTM for long-context information. + +stage=0 +train_stage=-10 +get_egs_stage=-10 +egs_opts= + +chunk_width=20 + +extra_left_context=60 + +relu_dim=256 +cell_dim=256 +projection_dim=64 + +# training options +num_epochs=1 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=8 +remove_egs=true +max_param_change=0.2 # Small max-param change for small network +dropout_schedule='0,0@0.20,0.1@0.50,0' + +egs_dir= +nj=40 + +dir= +affix=1a + +data_dir=exp/segmentation_1a/train_whole_rvb_hires +targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +set -o pipefail +set -u + +if [ -z "$dir" ]; then + dir=exp/segmentation_1a/tdnn_lstm_asr_sad +fi +dir=$dir${affix:+_$affix} + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=`feat-to-dim scp:$data_dir/feats.scp -` name=input + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat + + relu-renorm-layer name=tdnn1 input=lda dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn3 input=Append(-3,0,3,6) dim=$relu_dim add-log-stddev=true + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim decay-time=20 delay=-3 dropout-proportion=0.0 + relu-renorm-layer name=tdnn4 input=Append(-6,0,6,12) add-log-stddev=true dim=$relu_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim decay-time=20 delay=-3 dropout-proportion=0.0 + relu-renorm-layer name=tdnn5 input=Append(-12,0,12,24) dim=$relu_dim + + output-layer name=output include-log-softmax=true dim=3 learning-rate-factor=0.1 input=tdnn5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs/ + + cat <> $dir/configs/vars +num_targets=3 +EOF +fi + +if [ $stage -le 6 ]; then + num_utts=`cat $data_dir/utt2spk | wc -l` + # Set num_utts_subset for diagnostics to a reasonable value + # of max(min(0.005 * num_utts, 300), 12) + num_utts_subset=`perl -e '$n=int($ARGV[0] * 0.005); print ($n > 300 ? 300 : ($n < 12 ? 12 : $n))' $num_utts` + + steps/nnet3/train_raw_rnn.py --stage=$train_stage \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$egs_dir" --egs.stage=$get_egs_stage \ + --egs.chunk-left-context=$extra_left_context \ + --egs.chunk-right-context=$extra_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.optimization.shrink-value=0.99 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.max-param-change=$max_param_change \ + --trainer.compute-per-dim-accuracy=true \ + --cmd="$decode_cmd" --nj $nj \ + --cleanup=true \ + --cleanup.remove-egs=$remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --use-dense-targets=true \ + --feat-dir=$data_dir \ + --targets-scp="$targets_dir/targets.scp" \ + --egs.opts="--frame-subsampling-factor 3 --num-utts-subset $num_utts_subset" \ + --dir=$dir || exit 1 +fi + +if [ $stage -le 7 ]; then + # Use a subset to compute prior over the output targets + $train_cmd $dir/log/get_priors.log \ + matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 $targets_dir/targets.scp |" \ + ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1 + + echo 3 > $dir/frame_subsampling_factor +fi diff --git a/egs/aspire/s5/local/segmentation/tuning/train_stats_asr_sad_1a.sh b/egs/aspire/s5/local/segmentation/tuning/train_stats_asr_sad_1a.sh new file mode 100755 index 00000000000..80f9840f160 --- /dev/null +++ 
b/egs/aspire/s5/local/segmentation/tuning/train_stats_asr_sad_1a.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +# Copyright 2017 Nagendra Kumar Goel +# 2018 Vimal Manohar +# Apache 2.0 + +# This is a script to train a TDNN for speech activity detection (SAD) +# using statistics pooling for long-context information. + +stage=0 +train_stage=-10 +get_egs_stage=-10 +egs_opts= + +chunk_width=20 + +# The context is chosen to be around 1 second long. The context at test time +# is expected to be around the same. +extra_left_context=79 +extra_right_context=21 + +relu_dim=256 + +# training options +num_epochs=1 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=8 +remove_egs=true +max_param_change=0.2 # Small max-param change for small network + +egs_dir= +nj=40 + +dir= +affix=1a2 + +data_dir=exp/segmentation_1a/train_whole_rvb_hires +targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +set -o pipefail +set -u + +if [ -z "$dir" ]; then + dir=exp/segmentation_1a/tdnn_stats_asr_sad +fi +dir=$dir${affix:+_$affix} + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=`feat-to-dim scp:$data_dir/feats.scp -` name=input + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat + + relu-renorm-layer name=tdnn1 input=lda dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn3 input=Append(-3,0,3,6) dim=$relu_dim add-log-stddev=true + stats-layer name=tdnn3_stats config=mean+count(-99:3:9:99) + relu-renorm-layer name=tdnn4 input=Append(tdnn3@-6,tdnn3@0,tdnn3@6,tdnn3@12,tdnn3_stats) add-log-stddev=true dim=$relu_dim + stats-layer name=tdnn4_stats config=mean+count(-108:6:18:108) + relu-renorm-layer name=tdnn5 input=Append(tdnn4@-12,tdnn4@0,tdnn4@12,tdnn4@24,tdnn4_stats) dim=$relu_dim + + output-layer name=output include-log-softmax=true dim=3 learning-rate-factor=0.1 input=tdnn5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs/ + + cat <> $dir/configs/vars +num_targets=3 +EOF +fi + +if [ $stage -le 6 ]; then + num_utts=`cat $data_dir/utt2spk | wc -l` + # Set num_utts_subset for diagnostics to a reasonable value + # of max(min(0.005 * num_utts, 300), 12) + num_utts_subset=`perl -e '$n=int($ARGV[0] * 0.005); print ($n > 300 ? 300 : ($n < 12 ? 
12 : $n))' $num_utts` + + steps/nnet3/train_raw_rnn.py --stage=$train_stage \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$egs_dir" --egs.stage=$get_egs_stage \ + --egs.chunk-left-context=$extra_left_context \ + --egs.chunk-right-context=$extra_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.max-param-change=$max_param_change \ + --trainer.compute-per-dim-accuracy=true \ + --cmd="$decode_cmd" --nj $nj \ + --cleanup=true \ + --cleanup.remove-egs=$remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --use-dense-targets=true \ + --feat-dir=$data_dir \ + --targets-scp="$targets_dir/targets.scp" \ + --egs.opts="--frame-subsampling-factor 3 --num-utts-subset $num_utts_subset" \ + --dir=$dir || exit 1 +fi + +if [ $stage -le 7 ]; then + # Use a subset to compute prior over the output targets + $train_cmd $dir/log/get_priors.log \ + matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 $targets_dir/targets.scp |" \ + ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1 + + echo 3 > $dir/frame_subsampling_factor +fi diff --git a/egs/wsj/s5/steps/segmentation/convert_utt2spk_and_segments_to_rttm.py b/egs/wsj/s5/steps/segmentation/convert_utt2spk_and_segments_to_rttm.py index da9e655a404..e2a76d1a830 100755 --- a/egs/wsj/s5/steps/segmentation/convert_utt2spk_and_segments_to_rttm.py +++ b/egs/wsj/s5/steps/segmentation/convert_utt2spk_and_segments_to_rttm.py @@ -92,7 +92,7 @@ def main(): duration = float(parts[3]) - start_time print("SPEAKER {0} {1} {2:7.2f} {3:7.2f} " - " {4} \n".format( + " {4} ".format( file_id, channel, start_time, duration, spkr), file=rttm_writer) diff --git a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh index 60e3df20df2..b1fda1d42d5 100755 --- a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh +++ b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # Copyright 2016-17 Vimal Manohar # 2017 Nagendra Kumar Goel diff --git a/egs/wsj/s5/steps/segmentation/internal/get_transform_probs_mat.py b/egs/wsj/s5/steps/segmentation/internal/get_transform_probs_mat.py index dcf04d4cb3b..8a71e911c16 100755 --- a/egs/wsj/s5/steps/segmentation/internal/get_transform_probs_mat.py +++ b/egs/wsj/s5/steps/segmentation/internal/get_transform_probs_mat.py @@ -38,8 +38,9 @@ def get_args(): help="The fraction of garbage probability " "to add to silence") parser.add_argument("--sil-scale", type=float, - default=1.0, - help="Scale on the silence probability (make this more that one to encourage decoding silence).") + default=1.0, help="""Scale on the silence probability + (make this more than one to encourage + decoding silence).""") args = parser.parse_args() @@ -47,7 +48,7 @@ def get_args(): def run(args): - priors = [[1, 1, 1]] + priors = [[1.0, 1.0, 1.0]] if args.priors is not None: priors = common_lib.read_matrix_ascii(args.priors) if len(priors) != 0 and
len(priors[0]) != 3: diff --git a/egs/wsj/s5/steps/segmentation/internal/prepare_sad_graph.py b/egs/wsj/s5/steps/segmentation/internal/prepare_sad_graph.py index fc0aea3eb12..12c9bb1e902 100755 --- a/egs/wsj/s5/steps/segmentation/internal/prepare_sad_graph.py +++ b/egs/wsj/s5/steps/segmentation/internal/prepare_sad_graph.py @@ -38,7 +38,8 @@ def get_args(): and minimum silence duration constraint. The graph is written to the 'output_graph', which can be file or "-" for stdout. for segmentation with minimum and maximum speech duration constraints and minimum silence - duration constraint.""") + duration constraint.""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("--transition-scale", type=float, default=1.0, help="""Scale on transition probabilities relative to From ae22eece2f90d9d2d7ed4a37f7d92e17ca7b063e Mon Sep 17 00:00:00 2001 From: Pegita Date: Thu, 22 Feb 2018 16:40:51 -0500 Subject: [PATCH 125/174] fixed issues w.r.t comments (part 1). --- src/chain/chain-training.cc | 3 +- src/chain/chain-training.h | 6 +- src/chainbin/nnet3-chain-get-egs-post.cc | 397 +++++++++++++++++++++++ src/nnet3/nnet-chain-training.cc | 8 +- src/nnet3/nnet-example-utils.cc | 41 +-- src/nnet3/nnet-example-utils.h | 16 +- 6 files changed, 413 insertions(+), 58 deletions(-) create mode 100644 src/chainbin/nnet3-chain-get-egs-post.cc diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 40108636da0..38c72efe057 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -29,6 +29,7 @@ void ComputeObjfAndDeriv2(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const GeneralMatrix &supervision, const CuMatrixBase &nnet_output, + int32 num_sequences, int32 frames_per_sequence, BaseFloat *objf, BaseFloat *l2_term, BaseFloat *weight, @@ -46,8 +47,6 @@ void ComputeObjfAndDeriv2(const ChainTrainingOptions &opts, xent_output_deriv->SetZero(); xent_output_deriv->CopyFromMat(supervision.GetFullMatrix()); } - int32 num_sequences = 64, - frames_per_sequence = 150; BaseFloat sup_weight = 1.0; DenominatorComputation denominator(opts, den_graph, num_sequences, diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 8c276a4854f..5b9f43e04e8 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -121,11 +121,15 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, CuMatrixBase *xent_output_deriv = NULL); - +/** + This function uses supervision as numerator and does denominator computation. + It can be uses, where numerator is fixed e.g. TS learning. +*/ void ComputeObjfAndDeriv2(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const GeneralMatrix &supervision, const CuMatrixBase &nnet_output, + int32 num_sequences, int32 frames_per_sequence, BaseFloat *objf, BaseFloat *l2_term, BaseFloat *weight, diff --git a/src/chainbin/nnet3-chain-get-egs-post.cc b/src/chainbin/nnet3-chain-get-egs-post.cc new file mode 100644 index 00000000000..9aa0eba0fb8 --- /dev/null +++ b/src/chainbin/nnet3-chain-get-egs-post.cc @@ -0,0 +1,397 @@ +// chainbin/nnet3-chain-get-egs.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "hmm/posterior.h" +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-example-utils.h" +#include "lat/lattice-functions.h" +#include "chain/chain-supervision.h" + +namespace kaldi { +namespace nnet3 { + +/** This function scales weights for fst. +*/ +void ScaleFst(BaseFloat scale, + fst::StdVectorFst *fst) { + typedef fst::StdArc StdArc; + typedef fst::StdArc::Weight Weight; + int32 num_states = fst->NumStates(); + for (int32 s = 0; s < num_states; s++) { + for (fst::MutableArcIterator iter(fst, s); + !iter.Done(); iter.Next()) { + StdArc arc = iter.Value(); + BaseFloat scaled_weight = scale * iter.Value().weight.Value(); + //arc.weight.SetWeight(scaled_weight); + arc.weight = scaled_weight; + iter.SetValue(arc); + } + Weight final_weight = fst->Final(s); + //if (final_weight != Weight::Zero()) + // scale = 1.0; + fst->SetFinal(s, final_weight); + } +} + +/** This function converts lattice to fst with weight equel to weighted + average of acoustic and language score. +*/ +void ConvertLatticeToPdfLabels( + const TransitionModel &tmodel, + const Lattice &ifst, + fst::StdVectorFst *ofst) { + typedef fst::ArcTpl ArcIn; + typedef fst::StdArc ArcOut; + typedef ArcIn::StateId StateId; + ofst->DeleteStates(); + // The states will be numbered exactly the same as the original FST. + // Add the states to the new FST. + StateId num_states = ifst.NumStates(); + for (StateId s = 0; s < num_states; s++) { + StateId news = ofst->AddState(); + assert(news == s); + } + ofst->SetStart(ifst.Start()); + for (StateId s = 0; s < num_states; s++) { + LatticeWeight final_iweight = ifst.Final(s); + if (final_iweight != LatticeWeight::Zero()) { + fst::TropicalWeight final_oweight; + ConvertLatticeWeight(final_iweight, &final_oweight); + ofst->SetFinal(s, final_oweight); + } + for (fst::ArcIterator iter(ifst, s); + !iter.Done(); + iter.Next()) { + ArcIn arc = iter.Value(); + KALDI_PARANOID_ASSERT(arc.weight != LatticeWeight::Zero()); + ArcOut oarc; + ConvertLatticeWeight(arc.weight, &oarc.weight); + oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; + oarc.olabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; + oarc.nextstate = arc.nextstate; + ofst->AddArc(s, oarc); + } + } +} + + +/** + This function does all the processing for one utterance, and outputs the + supervision objects to 'example_writer'. Note: if normalization_fst is the + empty FST (with no states), it skips the final stage of egs preparation and + you should do it later with nnet3-chain-normalize-egs. 
+*/ + +static bool ProcessFile(const fst::StdVectorFst &normalization_fst, + const GeneralMatrix &feats, + const MatrixBase *ivector_feats, + int32 ivector_period, + const Lattice &lat, + int32 num_output_frames, + const std::string &utt_id, + bool compress, + int32 num_pdfs, + TransitionModel &tmodel, + UtteranceSplitter *utt_splitter, + NnetExampleWriter *example_writer) { + //KALDI_ASSERT(supervision.num_sequences == 1); + int32 num_input_frames = feats.NumRows(); + + if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, num_output_frames)) + return false; // LengthsMatch() will have printed a warning. + + std::vector chunks; + + utt_splitter->GetChunksForUtterance(num_input_frames, &chunks); + + if (chunks.empty()) { + KALDI_WARN << "Not producing egs for utterance " << utt_id + << " because it is too short: " + << num_input_frames << " frames."; + return false; + } + + int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; + + fst::StdVectorFst sup_fst, + scaled_normalization_fst(normalization_fst); + ConvertLatticeToPdfLabels(tmodel, lat, &sup_fst); + ScaleFst(0.5, &scaled_normalization_fst); // Scale lattice to have weights similar + // to weights used to combine lm weight + // with acoustic weight in sup_lat + if (normalization_fst.NumStates() > 0 && + !chain::AddWeightToFst(normalization_fst, &sup_fst)) { + KALDI_WARN << "For utterance " << utt_id << ", feature frames " + << ", FST was empty after composing with normalization FST. " + << "This should be extremely rare (a few per corpus, at most)"; + } + + // Convert fst to lattice to extract posterior using forward backward. + Lattice sup_lat; + ConvertFstToLattice(sup_fst, &sup_lat); + Posterior pdf_post; + LatticeForwardBackward(lat, &pdf_post); + + for (size_t c = 0; c < chunks.size(); c++) { + ChunkTimeInfo &chunk = chunks[c]; + + int32 start_frame_subsampled = chunk.first_frame / frame_subsampling_factor, + num_frames_subsampled = chunk.num_frames / frame_subsampling_factor; + + + // Do we need to substract 1 from post to convert it back to pdf-id. + // Select subset of posterior correspond to subset of utts. + // select subset of pdf-ids + Posterior labels(num_frames_subsampled); + for (int i = 0; i < num_frames_subsampled; i++) { + int t = i + start_frame_subsampled; + if (t < pdf_post.size()) + labels[i] = pdf_post[t]; + //for (std::vector >::iterator + // iter = labels[i].begin(); iter ! labels[i].end(); ++iter) + // iter->second *= chunk.output_weights[i]; + } + + int32 first_frame = 0; // we shift the time-indexes of all these parts so + // that the supervised part starts from frame 0. + + SubVector output_weights( + &(chunk.output_weights[0]), + static_cast(chunk.output_weights.size())); + + NnetExample nnet_eg; + nnet_eg.io.push_back(NnetIo("output", num_pdfs, 0, labels)); + nnet_eg.io.resize(ivector_feats != NULL ? 3 : 2); + + int32 tot_input_frames = chunk.left_context + chunk.num_frames + + chunk.right_context, + start_frame = chunk.first_frame - chunk.left_context; + + GeneralMatrix input_frames; + ExtractRowRangeWithPadding(feats, start_frame, tot_input_frames, + &input_frames); + + NnetIo input_io("input", -chunk.left_context, input_frames); + nnet_eg.io[0].Swap(&input_io); + + if (ivector_feats != NULL) { + // if applicable, add the iVector feature. 
+ // choose iVector from a random frame in the chunk + int32 ivector_frame = RandInt(start_frame, + start_frame + num_input_frames - 1), + ivector_frame_subsampled = ivector_frame / ivector_period; + if (ivector_frame_subsampled < 0) + ivector_frame_subsampled = 0; + if (ivector_frame_subsampled >= ivector_feats->NumRows()) + ivector_frame_subsampled = ivector_feats->NumRows() - 1; + Matrix ivector(1, ivector_feats->NumCols()); + ivector.Row(0).CopyFromVec(ivector_feats->Row(ivector_frame_subsampled)); + NnetIo ivector_io("ivector", 0, ivector); + nnet_eg.io[2].Swap(&ivector_io); + } + + if (compress) + nnet_eg.Compress(); + + std::ostringstream os; + os << utt_id << "-" << chunk.first_frame; + + std::string key = os.str(); // key is - + + example_writer->Write(key, nnet_eg); + } + return true; +} + +} // namespace nnet3 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + using namespace kaldi::chain; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Get frame-by-frame examples of data for nnet3+chain neural network\n" + "training. This involves breaking up utterances into pieces of a\n" + "fixed size. \n" + "The input is lattice and it will transform into new lattice " + "with pdf labels. The it will compose with " + "and does forward backward to get posterior.\n" + "This egs generation can be used for teacher student learning setup \n" + "where the lattice extracted from teacher network.\n" + "Note: if is not supplied the egs will not be\n" + "ready for training; in that case they should later be processed\n" + "with nnet3-chain-normalize-egs\n" + "\n" + "Usage: nnet3-chain-get-egs [options] [] " + " \n" + "\n" + "An example [where $feats expands to the actual features]:\n" + "chain-get-supervision [args] | \\\n" + " nnet3-chain-get-egs --left-context=25 --right-context=9 --num-frames=20 dir/normalization.fst \\\n" + " \"$feats\" ark,s,cs:- ark:cegs.1.ark\n" + "Note: the --frame-subsampling-factor option must be the same as given to\n" + "chain-get-supervision.\n"; + + bool compress = true; + int32 length_tolerance = 100, online_ivector_period = 1; + + ExampleGenerationConfig eg_config; // controls num-frames, + // left/right-context, etc. + + int32 srand_seed = 0, num_pdfs = -1; + std::string online_ivector_rspecifier, + trans_model; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs with input features " + "in compressed format (recommended). 
Update: this is now " + "only relevant if the features being read are un-compressed; " + "if already compressed, we keep we same compressed format when " + "dumping-egs."); + po.Register("ivectors", &online_ivector_rspecifier, "Alias for " + "--online-ivectors option, for back compatibility"); + po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of " + "ivector features, as a matrix."); + po.Register("online-ivector-period", &online_ivector_period, "Number of " + "frames between iVectors in matrices supplied to the " + "--online-ivectors option"); + po.Register("srand", &srand_seed, "Seed for random number generator "); + po.Register("length-tolerance", &length_tolerance, "Tolerance for " + "difference in num-frames between feat and ivector matrices"); + po.Register("num-pdfs", &num_pdfs, "Number of pdfs in the acoustic " + "model"); + po.Register("trans-model", &trans_model, + "Transition model"); + + eg_config.Register(&po); + + po.Read(argc, argv); + + srand(srand_seed); + + if (po.NumArgs() < 3 || po.NumArgs() > 4) { + po.PrintUsage(); + exit(1); + } + + if (num_pdfs <= 0) + KALDI_ERR << "--num-pdfs options is required."; + TransitionModel tmodel; + if (!trans_model.empty()) + ReadKaldiObject(trans_model, &tmodel); + + std::string + normalization_fst_rxfilename, + feature_rspecifier, + lattice_rspecifier, + examples_wspecifier; + if (po.NumArgs() == 3) { + feature_rspecifier = po.GetArg(1); + lattice_rspecifier = po.GetArg(2); + examples_wspecifier = po.GetArg(3); + } else { + normalization_fst_rxfilename = po.GetArg(1); + KALDI_ASSERT(!normalization_fst_rxfilename.empty()); + feature_rspecifier = po.GetArg(2); + lattice_rspecifier = po.GetArg(3); + examples_wspecifier = po.GetArg(4); + } + + eg_config.ComputeDerived(); + UtteranceSplitter utt_splitter(eg_config); + + fst::StdVectorFst normalization_fst; + if (!normalization_fst_rxfilename.empty()) { + ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); + KALDI_ASSERT(normalization_fst.NumStates() > 0); + } + + // Read as GeneralMatrix so we don't need to un-compress and re-compress + // when selecting parts of matrices. + SequentialGeneralMatrixReader feat_reader(feature_rspecifier); + //chain::RandomAccessSupervisionReader supervision_reader( + // supervision_rspecifier); + RandomAccessLatticeReader lattice_reader(lattice_rspecifier); + NnetExampleWriter example_writer(examples_wspecifier); + RandomAccessBaseFloatMatrixReader online_ivector_reader( + online_ivector_rspecifier); + + int32 num_err = 0; + + for (; !feat_reader.Done(); feat_reader.Next()) { + std::string key = feat_reader.Key(); + const GeneralMatrix &feats = feat_reader.Value(); + if (!lattice_reader.HasKey(key)) { + KALDI_WARN << "No pdf-level posterior for key " << key; + num_err++; + } else { + //const chain::Supervision &supervision = supervision_reader.Value(key); + const Lattice &lat = lattice_reader.Value(key); + const Matrix *online_ivector_feats = NULL; + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(key)) { + KALDI_WARN << "No iVectors for utterance " << key; + num_err++; + continue; + } else { + // this address will be valid until we call HasKey() or Value() + // again. 
+ online_ivector_feats = &(online_ivector_reader.Value(key)); + } + } + if (online_ivector_feats != NULL && + (abs(feats.NumRows() - (online_ivector_feats->NumRows() * + online_ivector_period)) > length_tolerance + || online_ivector_feats->NumRows() == 0)) { + KALDI_WARN << "Length difference between feats " << feats.NumRows() + << " and iVectors " << online_ivector_feats->NumRows() + << "exceeds tolerance " << length_tolerance; + num_err++; + continue; + } + int32 num_output_frames = 1; + if (!ProcessFile(normalization_fst, feats, + online_ivector_feats, online_ivector_period, + lat, num_output_frames, key, compress, num_pdfs, + tmodel, + &utt_splitter, &example_writer)) + num_err++; + } + } + if (num_err > 0) + KALDI_WARN << num_err << " utterances had errors and could " + "not be processed."; + // utt_splitter prints stats in its destructor. + return utt_splitter.ExitStatus(); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 4c799ea96c3..7608aea831e 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -63,9 +63,8 @@ void NnetChainTrainer::Train(const NnetExample &eg) { bool use_xent_regularization = (opts_.chain_config.xent_regularize != 0.0); ComputationRequest request; GetComputationRequest(*nnet_, eg, need_model_derivative, - nnet_config.store_component_stats, - use_xent_regularization, need_model_derivative, - &request); + nnet_config.store_component_stats, &request, + use_xent_regularization, need_model_derivative); const NnetComputation *computation = compiler_.Compile(request); // conventional training @@ -242,10 +241,11 @@ void NnetChainTrainer::ProcessOutputs(const NnetExample &eg, kUndefined); BaseFloat tot_objf, tot_l2_term, tot_weight; - + int32 num_sequences = 64, frames_per_sequence = 150; ComputeObjfAndDeriv2(opts_.chain_config, den_graph_, io.features, nnet_output, + num_sequences, frames_per_sequence, &tot_objf, &tot_l2_term, &tot_weight, &nnet_output_deriv, (use_xent ? &xent_deriv : NULL)); diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 2151e06bbb4..62fc88521bc 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -202,13 +202,13 @@ void GetComputationRequest(const Nnet &nnet, const NnetExample &eg, bool need_model_derivative, bool store_component_stats, + ComputationRequest *request, bool use_xent_regularization, - bool use_xent_derivative, - ComputationRequest *request) { + bool use_xent_derivative) { request->inputs.clear(); request->inputs.reserve(eg.io.size()); request->outputs.clear(); - request->outputs.reserve(eg.io.size() * 2); + request->outputs.reserve((use_xent_regularization ? 
2 : 1) * eg.io.size()); request->need_model_derivative = need_model_derivative; request->store_component_stats = store_component_stats; for (size_t i = 0; i < eg.io.size(); i++) { @@ -247,41 +247,6 @@ void GetComputationRequest(const Nnet &nnet, KALDI_ERR << "No outputs in computation request."; } -void GetComputationRequest(const Nnet &nnet, - const NnetExample &eg, - bool need_model_derivative, - bool store_component_stats, - ComputationRequest *request) { - request->inputs.clear(); - request->inputs.reserve(eg.io.size()); - request->outputs.clear(); - request->outputs.reserve(eg.io.size() * 2); - request->need_model_derivative = need_model_derivative; - request->store_component_stats = store_component_stats; - for (size_t i = 0; i < eg.io.size(); i++) { - const NnetIo &io = eg.io[i]; - const std::string &name = io.name; - int32 node_index = nnet.GetNodeIndex(name); - if (node_index == -1 && - !nnet.IsInputNode(node_index) && !nnet.IsOutputNode(node_index)) - KALDI_ERR << "Nnet example has input or output named '" << name - << "', but no such input or output node is in the network."; - - std::vector &dest = - nnet.IsInputNode(node_index) ? request->inputs : request->outputs; - dest.resize(dest.size() + 1); - IoSpecification &io_spec = dest.back(); - io_spec.name = name; - io_spec.indexes = io.indexes; - io_spec.has_deriv = nnet.IsOutputNode(node_index) && need_model_derivative; - } - // check to see if something went wrong. - if (request->inputs.empty()) - KALDI_ERR << "No inputs in computation request."; - if (request->outputs.empty()) - KALDI_ERR << "No outputs in computation request."; -} - void WriteVectorAsChar(std::ostream &os, bool binary, const VectorBase &vec) { diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 05f35fb44de..5f6c69f7d96 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -56,16 +56,6 @@ void ShiftExampleTimes(int32 t_offset, inputs; if you do, you can create/modify the ComputationRequest manually. Assumes that if need_model_derivative is true, you will be supplying derivatives w.r.t. all outputs. -*/ -void GetComputationRequest(const Nnet &nnet, - const NnetExample &eg, - bool need_model_derivative, - bool store_component_stats, - ComputationRequest *computation_request); - - -/** This function takes NnetExample and produces a ComputatioRequest. - It assumes you don't want the derivatives w.r.t the input; If use_xent_regularization == true, then it assumes that for each output name (e.g. "output" in the eg, there is another output with the same @@ -79,9 +69,9 @@ void GetComputationRequest(const Nnet &nnet, const NnetExample &eg, bool need_model_derivative, bool store_component_stats, - bool use_xent_regularization, - bool use_xent_derivative, - ComputationRequest *computation_request); + ComputationRequest *computation_request, + bool use_xent_regularization = false, + bool use_xent_derivative = false); // Writes as unsigned char a vector 'vec' that is required to have // values between 0 and 1. 
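
A note on the ScaleFst() helper introduced in nnet3-chain-get-egs-post.cc above: as committed it scales only the arc weights and writes each state's final weight back unchanged; the commented-out lines show that handling final weights was considered but left undecided. Below is a minimal, standalone OpenFst sketch of the variant that scales final weights as well. The name ScaleAllWeights is illustrative and does not appear in this patch series, and whether final weights should actually be scaled for the normalization FST is an assumption here, not something the patch settles.

    // Standalone sketch (illustrative, not code from this patch series):
    // scale every weight of a StdVectorFst by a constant, including the
    // final weights of final states.
    #include <fst/fstlib.h>

    void ScaleAllWeights(float scale, fst::StdVectorFst *fst) {
      typedef fst::StdArc StdArc;
      typedef fst::StdArc::Weight Weight;     // tropical: weight is a -log score
      typedef fst::StdArc::StateId StateId;
      for (StateId s = 0; s < fst->NumStates(); ++s) {
        // Scale every outgoing arc weight in place.
        for (fst::MutableArcIterator<fst::StdVectorFst> aiter(fst, s);
             !aiter.Done(); aiter.Next()) {
          StdArc arc = aiter.Value();
          arc.weight = Weight(scale * arc.weight.Value());
          aiter.SetValue(arc);
        }
        // Scale the final weight too, if this state is final.
        Weight final_weight = fst->Final(s);
        if (final_weight != Weight::Zero())
          fst->SetFinal(s, Weight(scale * final_weight.Value()));
      }
    }

Since the stored tropical weights are negated log scores, multiplying them by 0.5 (as ScaleFst(0.5, ...) does in ProcessFile) amounts to raising the underlying scores to the power 0.5, i.e. flattening the normalization FST before it is combined with the lattice-derived supervision FST. Note also that the patch as written scales a copy (scaled_normalization_fst) but then passes the unscaled normalization_fst to chain::AddWeightToFst(), so the scaled copy is currently unused.
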
From 7ac09a311c06b201cde95b26d403f562f5f29a44 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sun, 25 Feb 2018 19:25:50 -0500 Subject: [PATCH 126/174] Adding ML and separate MMI factors --- .../generate_uniformly_segmented_data_dir.sh | 60 +-- egs/aspire/s5/local/nnet3/prep_test_aspire.sh | 2 - .../s5/local/nnet3/run_ivector_common.sh | 2 +- .../s5/local/chain/compare_wer_general.py | 93 ++-- .../semisup/chain/tuning/run_tdnn_100k_d.sh | 87 +++- ...nn_lstm_100k_250k_semisupervised_conf_c.sh | 5 +- .../chain/tuning/run_tdnn_lstm_100k_smbr_a.sh | 4 +- egs/fisher_english/s5/path.sh | 2 + .../steps/libs/nnet3/xconfig/basic_layers.py | 19 +- egs/wsj/s5/steps/nnet2/remove_egs.sh | 8 +- egs/wsj/s5/steps/nnet3/chain/train.py | 11 + .../data/perturb_speed_to_allowed_lengths.py | 486 ++++++++++-------- src/chain/chain-denominator-smbr.cc | 141 +++-- src/chain/chain-denominator-smbr.h | 6 +- src/chain/chain-kernels-ansi.h | 6 +- src/chain/chain-smbr-kernels.cu | 38 +- src/chain/chain-training.cc | 70 +-- src/chain/chain-training.h | 11 +- 18 files changed, 638 insertions(+), 413 deletions(-) diff --git a/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh b/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh index 05f1752cda5..3c5df229bd6 100755 --- a/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh +++ b/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh @@ -32,51 +32,51 @@ data_set=$1 if [ "$data_set" == "dev_aspire" ]; then if [ $stage -le 1 ]; then - echo "$0 : Creating the data dir with whole recordings without segmentation" + echo "$0: Creating the data dir with whole recordings without segmentation" # create a whole directory without the segments - unseg_dir=data/${data_set}_whole - src_dir=data/$data_set - mkdir -p $unseg_dir - echo "$0 : Creating the $unseg_dir/wav.scp file" - cp $src_dir/wav.scp $unseg_dir + unseg_dir=data/${data_set}_whole_hires + src_dir=data/${data_set} + utils/data/convert_data_dir_to_whole.sh $src_dir $unseg_dir - echo "$0 : Creating the $unseg_dir/reco2file_and_channel file" + echo "$0: Creating the $unseg_dir/reco2file_and_channel file" cat $unseg_dir/wav.scp | awk '{print $1, $1, "A";}' > $unseg_dir/reco2file_and_channel - cat $unseg_dir/wav.scp | awk '{print $1, $1;}' > $unseg_dir/utt2spk - utils/utt2spk_to_spk2utt.pl $unseg_dir/utt2spk > $unseg_dir/spk2utt - fi data_set=${data_set}_whole +else + utils/copy_data_dir.sh data/$data_set data/${data_set}_hires fi -segmented_data_set=${data_set}_uniformsegmented_win${window}_over${overlap} if [ $stage -le 2 ]; then - echo "$0 : Generating uniform segments with length $window and overlap $overlap." 
- [ -d data/$segmented_data_set ] && rm -r data/$segmented_data_set - utils/copy_data_dir.sh --validate-opts "--no-text" \ - data/$data_set data/$segmented_data_set - cp data/$data_set/reco2file_and_channel data/$segmented_data_set - - local/multi_condition/create_uniform_segments.py \ - --overlap $overlap --window $window data/$segmented_data_set - - for file in cmvn.scp feats.scp; do - rm -f data/$segmented_data_set/$file - done - utils/validate_data_dir.sh --no-text --no-feats data/$segmented_data_set + echo "$0: Extracting features" + steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ + --mfcc-config conf/mfcc_hires.conf data/${data_set}_hires + + steps/compute_cmvn_stats.sh data/${data_set}_hires + + utils/fix_data_dir.sh data/${data_set}_hires + utils/validate_data_dir.sh --no-text data/${data_set}_hires fi +segmented_data_set=${data_set}_uniformsegmented_win${window}_over${overlap} if [ $stage -le 3 ]; then - echo "$0 : Extracting features for the uniformly segmented dir" + echo "$0 : Generating uniform segments with length $window and overlap $overlap." [ -d data/${segmented_data_set}_hires ] && rm -r data/${segmented_data_set}_hires - utils/copy_data_dir.sh --validate-opts "--no-text " \ - data/${segmented_data_set} data/${segmented_data_set}_hires + if [ ! -f data/${data_set}_hires/segments ]; then + utils/data/get_segments_for_data.sh data/${data_set}_hires > \ + data/${data_set}_hires/segments + fi - steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ - --mfcc-config conf/mfcc_hires.conf data/${segmented_data_set}_hires + mkdir -p data/${segmented_data_set}_hires + + utils/data/get_uniform_subsegments.py \ + --max-segment-duration=$window \ + --overlap-duration=$overlap \ + --max-remaining-duration=$(perl -e "print $window/ 2.0") \ + data/${data_set}_hires/segments > data/${segmented_data_set}_hires/sub_segments + utils/data/subsegment_data_dir.sh data/${data_set}_hires \ + data/${segmented_data_set}_hires/sub_segments data/${segmented_data_set}_hires steps/compute_cmvn_stats.sh data/${segmented_data_set}_hires - utils/fix_data_dir.sh data/${segmented_data_set}_hires utils/validate_data_dir.sh --no-text data/${segmented_data_set}_hires fi diff --git a/egs/aspire/s5/local/nnet3/prep_test_aspire.sh b/egs/aspire/s5/local/nnet3/prep_test_aspire.sh index cb69aaff10b..12e95516718 100755 --- a/egs/aspire/s5/local/nnet3/prep_test_aspire.sh +++ b/egs/aspire/s5/local/nnet3/prep_test_aspire.sh @@ -186,8 +186,6 @@ if [ $stage -le 8 ]; then --ctm-beam 6 \ --iter $iter \ --decode-mbr true \ - --window $window \ - --overlap $overlap \ --tune-hyper true \ $lang $decode_dir $act_data_set $segmented_data_set $out_file fi diff --git a/egs/aspire/s5/local/nnet3/run_ivector_common.sh b/egs/aspire/s5/local/nnet3/run_ivector_common.sh index 2df1bf8106c..e8d3dc4a609 100755 --- a/egs/aspire/s5/local/nnet3/run_ivector_common.sh +++ b/egs/aspire/s5/local/nnet3/run_ivector_common.sh @@ -116,7 +116,7 @@ if [ $stage -le 5 ]; then fi if [ $stage -le 6 ]; then - ivectordir=exp/nnet3/ivectors_train + ivectordir=exp/nnet3/ivectors_train_rvb if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then # this shows how you can split across multiple file-systems. 
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/ivectors/aspire/s5/$ivectordir/storage $ivectordir/storage fi diff --git a/egs/fisher_english/s5/local/chain/compare_wer_general.py b/egs/fisher_english/s5/local/chain/compare_wer_general.py index e3a2dc5417a..e6dc33779eb 100755 --- a/egs/fisher_english/s5/local/chain/compare_wer_general.py +++ b/egs/fisher_english/s5/local/chain/compare_wer_general.py @@ -30,6 +30,8 @@ def get_args(): help="Used to include looped results") parser.add_argument("--field-size", type=int, help="Field size for the models") + parser.add_argument("--outputs", type=str, default="output", + help="Comma separated list of output-names") parser.add_argument("systems", nargs='+') args = parser.parse_args() @@ -69,28 +71,29 @@ def __init__(self, dir_name, suffix, model_name): else: used_epochs = False - self.probs = [] + self.probs = defaultdict(list) self.wers = defaultdict(lambda: "NA") self.ins = defaultdict(lambda: "NA") self.dels = defaultdict(lambda: "NA") self.sub = defaultdict(lambda: "NA") - def add_wer(self, dev_set, affix=""): + def add_wer(self, dev_set, is_looped=False): decode_name = dev_set + self.suffix out = common_lib.get_command_stdout( - "grep WER {dir_name}/decode{affix}_{decode_name}/wer* | utils/best_wer.sh" - "".format(dir_name=self.dir_name, affix=affix, - decode_name=decode_name), + "grep WER {dir_name}/decode*_{decode_name}/wer* | grep {looped_filter} looped | utils/best_wer.sh" + "".format(dir_name=self.dir_name, decode_name=decode_name, + looped_filter="-v" if not is_looped else ""), require_zero_status=False) + affix = "looped" if is_looped else "" if out != "" and len(out.split()) >= 2: self.wers[(dev_set, affix)] = out.split()[1] self.ins[(dev_set, affix)] = out.split()[6] self.dels[(dev_set, affix)] = out.split()[8] self.sub[(dev_set, affix)] = out.split()[10] - def _get_prob(self, set_="train", xent=False): + def _get_prob(self, output_name="output", set_="train", xent=False): if not os.path.exists( "{dir_name}/log/compute_prob_{set}.{iter}.log" @@ -112,18 +115,17 @@ def _get_prob(self, set_="train", xent=False): affix = "-xent" if xent else "" for line in lines: - if (bool(re.search(r"'output-0{0}'".format(affix), line)) - or bool(re.search(r"'output{0}'".format(affix), line))): + if bool(re.search(r"'{0}{1}'".format(output_name, affix), line)): prob = float(line.split()[7]) break return "NA" if prob is None else "{0:.4f}".format(prob) - def add_probs(self): - self.probs.append(self._get_prob(set_="train", xent=False)) - self.probs.append(self._get_prob(set_="valid", xent=False)) - self.probs.append(self._get_prob(set_="train", xent=True)) - self.probs.append(self._get_prob(set_="valid", xent=True)) + def add_probs(self, output_name="output"): + self.probs[output_name].append(self._get_prob(output_name=output_name, set_="train", xent=False)) + self.probs[output_name].append(self._get_prob(output_name=output_name, set_="valid", xent=False)) + self.probs[output_name].append(self._get_prob(output_name=output_name, set_="train", xent=True)) + self.probs[output_name].append(self._get_prob(output_name=output_name, set_="valid", xent=True)) def run(args): @@ -142,10 +144,11 @@ def run(args): info.add_wer(dev_set) if args.include_looped: - info.add_wer(dev_set, affix="_looped") + info.add_wer(dev_set, is_looped=True) if not used_epochs: - info.add_probs() + for output_name in args.outputs.split(','): + info.add_probs(output_name) systems.append(info) @@ -154,13 +157,14 @@ def run(args): def print_system_infos(args, system_infos, 
used_epochs=False): field_sizes = [args.field_size] * len(system_infos) + output_names = args.outputs.split(",") if args.field_size is None: for i, x in enumerate(system_infos): field_sizes[i] = len(x.model_name) separator = args.separator - print ("# {0: <25}{sep}{1}".format( + print ("# {0: <35}{sep}{1}".format( "System", "{sep}".format(sep=args.separator).join( ["{0: <{1}}".format(x.model_name, field_sizes[i]) @@ -174,7 +178,7 @@ def print_system_infos(args, system_infos, used_epochs=False): for tup in sorted(list(tups)): dev_set, affix = tup - print ("# {0: <25}{sep}{1}".format( + print ("# {0: <35}{sep}{1}".format( "WER on {0} {1}" "".format(dev_set, "[ "+affix+" ]" if affix != "" else ""), "{sep}".format(sep=args.separator).join( @@ -182,21 +186,21 @@ def print_system_infos(args, system_infos, used_epochs=False): for i, x in enumerate(system_infos)]), sep=args.separator)) if args.print_fine_details: - print ("# {0: <25}{sep}{1}".format( + print ("# {0: <35}{sep}{1}".format( "#Ins on {0} {1}" "".format(dev_set, "[ "+affix+" ]" if affix != "" else ""), "{sep}".format(sep=args.separator).join( ["{0: <{1}}".format(x.ins[tup], field_sizes[i]) for i, x in enumerate(system_infos)]), sep=args.separator)) - print ("# {0: <25}{sep}{1}".format( + print ("# {0: <35}{sep}{1}".format( "#Del on {0} {1}" "".format(dev_set, "[ "+affix+" ]" if affix != "" else ""), "{sep}".format(sep=args.separator).join( ["{0: <{1}}".format(x.dels[tup], field_sizes[i]) for i, x in enumerate(system_infos)]), sep=args.separator)) - print ("# {0: <25}{sep}{1}".format( + print ("# {0: <35}{sep}{1}".format( "#Sub on {0} {1}" "".format(dev_set, "[ "+affix+" ]" if affix != "" else ""), "{sep}".format(sep=args.separator).join( @@ -205,33 +209,34 @@ def print_system_infos(args, system_infos, used_epochs=False): sep=args.separator)) if not used_epochs: - print ("# {0: <25}{sep}{1}".format( - "Final train prob", - "{sep}".format(sep=args.separator).join( - ["{0: <{1}}".format(x.probs[0], field_sizes[i]) - for i, x in enumerate(system_infos)]), - sep=args.separator)) + for output_name in output_names: + print ("# {0: <35}{sep}{1}".format( + "Final {0} train prob".format(output_name), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.probs[output_name][0], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) - print ("# {0: <25}{sep}{1}".format( - "Final valid prob", - "{sep}".format(sep=args.separator).join( - ["{0: <{1}}".format(x.probs[1], field_sizes[i]) - for i, x in enumerate(system_infos)]), - sep=args.separator)) + print ("# {0: <35}{sep}{1}".format( + "Final {0} valid prob".format(output_name), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.probs[output_name][1], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) - print ("# {0: <25}{sep}{1}".format( - "Final train prob (xent)", - "{sep}".format(sep=args.separator).join( - ["{0: <{1}}".format(x.probs[2], field_sizes[i]) - for i, x in enumerate(system_infos)]), - sep=args.separator)) + print ("# {0: <35}{sep}{1}".format( + "Final {0} train prob (xent)".format(output_name), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.probs[output_name][2], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) - print ("# {0: <25}{sep}{1}".format( - "Final valid prob (xent)", - "{sep}".format(sep=args.separator).join( - ["{0: <{1}}".format(x.probs[3], field_sizes[i]) - for i, x in enumerate(system_infos)]), - sep=args.separator)) + print ("# {0: 
<35}{sep}{1}".format( + "Final {0} valid prob (xent)".format(output_name), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.probs[output_name][3], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) if __name__ == "__main__": diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_d.sh index 569d4d0604e..6fe8cdececc 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_d.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_d.sh @@ -26,6 +26,22 @@ remove_egs=false common_egs_dir= minibatch_size=128 +# smbr finetuning +do_smbr_finetuning=false + +finetune_num_extra_lm_states=2000 +finetune_stage=-1 # Set this lower to train den.fst +finetune_suffix=_smbr +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_l2_regularize=0.00005 +finetune_opts="--chain.mmi-factor-schedule=0.0,0.0 --chain.smbr-factor-schedule=1,1" +finetune_leaky_hmm_coefficient=0.001 +finetune_apply_deriv_weights=true +finetune_lr=0.000005 +chain_smbr_extra_opts= + # End configuration section. echo "$0 $@" # Print the command line for logging @@ -68,8 +84,6 @@ if [ $stage -le 9 ]; then rm $lat_dir/fsts.*.gz # save space fi -exit 1 - if [ $stage -le 10 ]; then # Create a version of the lang/ directory that has one state per phone in the # topo file. [note, it really has two states.. the first one is only repeated @@ -165,7 +179,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.max-param-change 2.0 \ - --cleanup.remove-egs $remove_egs \ + --cleanup.remove-egs false \ --feat-dir $train_data_dir \ --tree-dir $treedir \ --lat-dir $lat_dir \ @@ -196,6 +210,71 @@ if [ $stage -le 15 ]; then ) & done fi + +if ! $do_smbr_finetuning; then + wait + exit 0; +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ ! 
-z "$common_egs_dir" ]; then + egs_dir=$common_egs_dir + else + egs_dir=$dir/egs + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient $finetune_leaky_hmm_coefficient \ + --chain.l2-regularize $finetune_l2_regularize \ + --chain.apply-deriv-weights $finetune_apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=$finetune_num_extra_lm_states" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate $finetune_lr \ + --trainer.optimization.final-effective-lrate $(perl -e "print $finetune_lr * 0.1") \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir --lang $lang \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + wait; exit 0; - diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_c.sh index cca77616936..f41374e4593 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_c.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_c.sh @@ -38,6 +38,7 @@ phone_insertion_penalty= comb_affix=comb_250k_1c # affix for new chain-model directory trained on the combined supervised+unsupervised subsets supervision_weights=1.0,1.0 lm_weights=3,2 +num_copies= sup_egs_dir= unsup_egs_dir= unsup_egs_opts= @@ -360,8 +361,8 @@ comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi if [ $stage -le 14 ]; then steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ --minibatch-size 64 --frames-per-iter 1500000 \ - --lang2weight $supervision_weights --egs-prefix cegs. 2 \ - $sup_egs_dir $unsup_egs_dir $comb_egs_dir + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
fi diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh index d704894baa6..1806303f319 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh @@ -32,6 +32,7 @@ extra_opts="--chain.mmi-factor-schedule=1.0,1.0@0.1,0.5@0.2,0.5 --chain.smbr-fac chain_smbr_extra_opts= smbr_leaky_hmm_coefficient=0.00001 leaky_hmm_coefficient=0.1 +l2_regularize=0.0 # 00005 # decode options extra_left_context=50 @@ -167,7 +168,7 @@ if [ $stage -le 13 ]; then --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient $leaky_hmm_coefficient \ --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ - --chain.l2-regularize 0.0 \ + --chain.l2-regularize $l2_regularize \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ --trainer.dropout-schedule $dropout_schedule \ @@ -195,6 +196,7 @@ if [ $stage -le 13 ]; then --tree-dir $treedir \ --lat-dir $lat_dir \ --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --cleanup.preserve-model-interval 10 \ --dir $dir --lang $lang $extra_opts || exit 1; fi diff --git a/egs/fisher_english/s5/path.sh b/egs/fisher_english/s5/path.sh index 7cad3842ab3..84fff2ad735 100755 --- a/egs/fisher_english/s5/path.sh +++ b/egs/fisher_english/s5/path.sh @@ -5,3 +5,5 @@ export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH export PYTHONPATH=${PYTHONPATH:+$PYTHONPATH:}$KALDI_ROOT/tools/tensorflow_build/.local/lib/python2.7/site-packages export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$KALDI_ROOT/tools/tensorflow/bazel-bin/tensorflow/:/usr/local/cuda/lib64:/export/a11/hlyu/cudnn/lib64:/home/dpovey/libs/ export LC_ALL=C +. 
/etc/profile.d/modules.sh +module load shared cuda80/toolkit diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 762e1ccf29b..26599bdd8a0 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -497,9 +497,14 @@ def set_default_configs(self): def check_configs(self): - if self.config['offset-file'] != '' and self.config['dim'] <= -1: - raise RuntimeError("In output-layer, dim has invalid value {0}" - "".format(self.config['dim'])) + if self.config['offset-file'] == '': + if self.config['dim'] <= -1: + raise RuntimeError("In output-layer, dim has invalid value {0}" + "".format(self.config['dim'])) + if self.config['learning-rate-factor'] <= 0.0: + raise RuntimeError("In output-layer, learning-rate-factor has" + " invalid value {0}" + "".format(self.config['learning-rate-factor'])) if self.config['objective-type'] != 'linear' and \ self.config['objective-type'] != 'quadratic': @@ -507,10 +512,6 @@ def check_configs(self): " invalid value {0}" "".format(self.config['objective-type'])) - if self.config['learning-rate-factor'] <= 0.0: - raise RuntimeError("In output-layer, learning-rate-factor has" - " invalid value {0}" - "".format(self.config['learning-rate-factor'])) def auxiliary_outputs(self): @@ -645,12 +646,12 @@ def _generate_config(self): ''.format(self.name, self.config['offset-file'], max_change, ng_affine_options, learning_rate_option, l2_regularize_option)) - ans.append(line) + configs.append(line) line = ('component-node name={0}.offset' ' component={0}.offset input={1}' ''.format(self.name, descriptor_final_string)) - ans.append(line) + configs.append(line) cur_node = '{0}.offset'.format(self.name) if include_log_softmax: diff --git a/egs/wsj/s5/steps/nnet2/remove_egs.sh b/egs/wsj/s5/steps/nnet2/remove_egs.sh index f8e37d86a11..673a0c13993 100755 --- a/egs/wsj/s5/steps/nnet2/remove_egs.sh +++ b/egs/wsj/s5/steps/nnet2/remove_egs.sh @@ -10,6 +10,12 @@ # data that's linked to as well as the soft link), and we want to not # delete the examples if someone has done "touch $dir/egs/.nodelete". +force=false + +if [ $1 == "--force" ]; then + force=true + shift +fi if [ $# != 1 ]; then echo "Usage: $0 " @@ -28,7 +34,7 @@ if [ ! -d $egs ]; then exit 1; fi -if [ -f $egs/.nodelete ]; then +if ! 
$force && [ -f $egs/.nodelete ]; then echo "$0: not deleting egs in $egs since $egs/.nodelete exists" exit 0; fi diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 841156f4e8a..d216a717d75 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -128,6 +128,10 @@ def get_args(): dest='mmi_factor_schedule', default=None, action=common_lib.NullstrToNoneAction, help="Schedule for MMI factor in LF-SMBR training.") + parser.add_argument("--chain.ml-factor-schedule", type=str, + dest='ml_factor_schedule', default=None, + action=common_lib.NullstrToNoneAction, + help="Schedule for ML factor in LF-SMBR training.") parser.add_argument("--chain.smbr-xent-regularize", default=None, dest='smbr_xent_regularize', type=float, help="Xent regularizer term used with sMBR training") @@ -600,6 +604,13 @@ def train(args, run_opts): objective_opts += " --mmi-factor={0}".format(mmi_factor) + if args.ml_factor_schedule is not None: + ml_factor = common_train_lib.get_schedule_value( + args.ml_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --ml-factor={0}".format(ml_factor) + objective_opts += " --norm-regularize={0}".format( "true" if args.norm_regularize else "false") diff --git a/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py b/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py index ee2c5def33d..efe59d0a3aa 100755 --- a/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py +++ b/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py @@ -1,11 +1,11 @@ #!/usr/bin/env python -# Copyright 2017 Johns Hopkins University (author: Hossein Hadian) +# Copyright 2017 Hossein Hadian # Apache 2.0 -""" This script perturbs speeds of utterances to force their lengths to some allowed - lengths spaced by a factor +""" This script perturbs speeds of utterances to force their lengths to some + allowed lengths spaced by a factor (like 10%) """ import argparse @@ -14,222 +14,288 @@ import copy import math -parser = argparse.ArgumentParser(description="""This script ...""") -parser.add_argument('factor', type=float, default=12, - help='spacing (in percentage) between allowed lengths.') -parser.add_argument('srcdir', type=str, - help='path to source data dir') -parser.add_argument('dir', type=str, help='output dir') -parser.add_argument('--range-factor', type=float, default=0.05, - help="""Percentage of durations not covered from each side of - duration histogram.""") -parser.add_argument('--no-speed-perturb', action='store_true') -parser.add_argument("--only-speed-perturb", action='store_true') - -args = parser.parse_args() - -### functions and classes ### - -class Speaker: - def __init__(self, path, sid): - self.path = path - self.name = os.path.basename(os.path.normpath(path)) - self.id = sid - self.utterances = [] - def str_id(self): - return "s" + zero_pad(str(self.id), 4) +def get_args(): + parser = argparse.ArgumentParser(description="""This script copies the 'srcdir' + data directory to output data directory 'dir' + while modifying the utterances so that there are + 3 copies of each utterance: one with the same + speed, one with a higher speed (not more than + factor% faster) and one with a lower speed + (not more than factor% slower)""") + parser.add_argument('factor', type=float, default=12, + help='Spacing (in percentage) between allowed lengths.') + parser.add_argument('srcdir', type=str, + help='path to source data dir') + parser.add_argument('dir', type=str, 
help='output dir') + parser.add_argument('--coverage-factor', type=float, default=0.05, + help="""Percentage of durations not covered from each + side of duration histogram.""") + parser.add_argument('--frame-shift', type=int, default=10, + help="""Frame shift in milliseconds.""") + parser.add_argument('--frame-length', type=int, default=25, + help="""Frame length in milliseconds.""") + parser.add_argument('--frame-subsampling-factor', type=int, default=3, + help="""Chain frame subsampling factor. + See steps/nnet3/chain/train.py""") + parser.add_argument('--speed-perturb', + type=str, choices=["true", "false"], default="true", + help="""If false, no speed perturbation will occur, i.e. + only 1 copy of each utterance will be + saved, which is modified to have an allowed length + by using extend-wav-with-silence.""") + args = parser.parse_args() + args.speed_perturb = True if args.speed_perturb == "true" else False + return args + class Utterance: - def __init__(self, uid, wavefile, speaker, transcription, dur): - self.wavefile = wavefile - self.speaker = speaker - self.transcription = transcription - self.id = uid - self.dur = float(dur) + """ This class represents a Kaldi utterance + in a data directory like data/train + """ + + def __init__(self, uid, wavefile, speaker, transcription, dur): + self.wavefile = (wavefile if wavefile.rstrip().endswith('|') else + 'cat {} |'.format(wavefile)) + self.speaker = speaker + self.transcription = transcription + self.id = uid + self.dur = float(dur) + + def to_kaldi_utt_str(self): + return self.id + " " + self.transcription - def to_kaldi_utt_str(self): - return self.id + " " + self.transcription + def to_kaldi_wave_str(self): + return self.id + " " + self.wavefile - def to_kaldi_wave_str(self): - return self.id + " " + self.wavefile + def to_kaldi_dur_str(self): + return "{} {:0.3f}".format(self.id, self.dur) def read_kaldi_datadir(dir): - utts = [] - wav_scp = read_kaldi_mapfile(os.path.join(dir, 'wav.scp')) - text = read_kaldi_mapfile(os.path.join(dir, 'text')) - utt2dur = read_kaldi_mapfile(os.path.join(dir, 'utt2dur')) - utt2spk = read_kaldi_mapfile(os.path.join(dir, 'utt2spk')) - for utt in wav_scp: - if utt in text and utt in utt2dur and utt in utt2spk: - utts += [Utterance(utt, wav_scp[utt], utt2spk[utt], text[utt], utt2dur[utt])] - else: - print('Incomplete data for utt {}'.format(utt)) - return utts + """ Read a data directory like + data/train as a list of utterances + """ + + utterances = [] + wav_scp = read_kaldi_mapfile(os.path.join(dir, 'wav.scp')) + text = read_kaldi_mapfile(os.path.join(dir, 'text')) + utt2dur = read_kaldi_mapfile(os.path.join(dir, 'utt2dur')) + utt2spk = read_kaldi_mapfile(os.path.join(dir, 'utt2spk')) + for utt in wav_scp: + if utt in text and utt in utt2dur and utt in utt2spk: + utterances.append(Utterance(utt, wav_scp[utt], utt2spk[utt], + text[utt], utt2dur[utt])) + else: + print('Warning: incomplete data for utt {}'.format(utt)) + return utterances def read_kaldi_mapfile(path): - m = {} - with open(path, 'r') as f: - for line in f: - line = line.rstrip() - sp_pos = line.find(' ') - key = line[:sp_pos] - val = line[sp_pos+1:] - m[key] = val - return m + """ Read any Kaldi mapping file - like text, .scp files, etc. + """ + + m = {} + with open(path, 'r') as f: + for line in f: + line = line.strip() + sp_pos = line.find(' ') + key = line[:sp_pos] + val = line[sp_pos+1:] + m[key] = val + return m def generate_kaldi_data_files(utterances, outdir): - print "Exporting to ", outdir, "..." 
- spks = {} - - f = open(os.path.join(outdir, 'text'), 'w') - for utt in utterances: - f.write(utt.to_kaldi_utt_str() + "\n") - f.close() - - f = open(os.path.join(outdir, 'wav.scp'), 'w') - for utt in utterances: - f.write(utt.to_kaldi_wave_str() + "\n") - f.close() - - f = open(os.path.join(outdir, 'utt2dur'), 'w') - for utt in utterances: - f.write(utt.id + " " + str(utt.dur) + "\n") - f.close() - - f = open(os.path.join(outdir, 'utt2spk'), 'w') - for utt in utterances: - f.write(utt.id + " " + utt.speaker + "\n") - if utt.speaker not in spks: - spks[utt.speaker] = [utt.id] - else: - spks[utt.speaker] += [utt.id] - f.close() - - f = open(os.path.join(outdir, 'spk2utt'), 'w') - for s in spks: - f.write(s + " ") - for utt in spks[s]: - f.write(utt + " ") - f.write('\n') - f.close() - - - - -### main ### - -if not os.path.exists(args.dir): - os.makedirs(args.dir) - -# 0. load src dir -utts = read_kaldi_datadir(args.srcdir) - -factor = 1.0 + float(args.factor)/100 -# 1a. find start-dur and end-dur -## echo "Durs = [" >durs.m && cut -d' ' -f2 data/train_nodup_seg/utt2dur | tr '\n' ',' >>durs.m && echo " ];" >>durs.m -durs = [] -for u in utts: - durs += [u.dur] -durs.sort() -to_ignore_dur = 0 -tot_dur = sum(durs) -for d in durs: - to_ignore_dur += d - if to_ignore_dur * 100.0 / tot_dur > args.range_factor: - start_dur = d - break -to_ignore_dur = 0 -for d in reversed(durs): - to_ignore_dur += d - if to_ignore_dur * 100.0 / tot_dur > args.range_factor: - end_dur = d - break -print("Durations in the range [{},{}] will be covered. Coverage rate: {}%".format(start_dur, end_dur, 100.0-args.range_factor*2)) -print("There will be {} unique allowed lengths for the utterances.".format(int(math.log(end_dur/start_dur)/math.log(factor)))) -#sys.exit(0) - -# 1b. compute and write allowed lengths -#start_dur = 0.88 -#end_dur = 19.00 -durs = [] -d = start_dur -f = open(os.path.join(args.dir, 'allowed_durs.txt'), 'wb') -f2 = open(os.path.join(args.dir, 'allowed_lengths.txt'), 'wb') -while d < end_dur: - length = int(d*1000 - 25) / 10 + 1 # for the most common length of frames and overlap - if length % 3 != 0: - lo = 3 * (length / 3) - hi = lo + 3 - #if length - lo <= hi - length: - # length = lo - #else: - # length = hi - length = lo # should select lo to make sure the jump is not bigger than 12% - dnew = (10.0 * (length - 1.0) + 25.0 + 5.0) / 1000.0 # +5 is for safety - d = dnew - durs += [d] - f.write(str(d) + '\n') - f2.write(str(length) + '\n') - d *= factor -f.close() -f2.close() - -# 2. 
perturb to allowed durs -# sox -t wav seg1.wav -t wav long95.wav speed 0.873684211 -perturbed_utts = [] -durs = durs + [1000000] -for u in utts: - prev_d = 0.0 - i = 0 - for d in durs: - if u.dur <= d and u.dur >= prev_d: - break - i += 1 - prev_d = d - # i determines the closest allowed durs - - allowed_dur = durs[i - 1] if i > 0 else durs[i] - speed = u.dur / allowed_dur - if max(speed, 1.0/speed) > factor: - #print('rejected: {} --> dur was {} speed was {}'.format(u.id, u.dur, speed)) - continue - u1 = copy.deepcopy(u) - prefix = 'pv1' if not args.only_speed_perturb else '' - u1.id = prefix + u.id - u1.speaker = prefix + u.speaker - parts = u.wavefile.split() - if len(parts) == 1: - u1.wavefile = 'wav-copy {0} - | sox -t wav - -t wav - speed {1} | '.format( - u.wavefile, speed) - else: - assert parts[-1] == "|" - u1.wavefile = '{0} sox -t wav - -t wav - speed {1} | '.format( - u.wavefile, speed) - u1.dur = allowed_dur - if not args.no_speed_perturb: - perturbed_utts += [u1] - - if args.only_speed_perturb: - continue - - delta = allowed_dur - u.dur - if delta <= 1e-4: - continue - u3 = copy.deepcopy(u) - prefix = 'pv3-' if not args.no_speed_perturb else '' - u3.id = prefix + u.id - u3.speaker = prefix + u.speaker - - parts = u.wavefile.split() - if len(parts) == 1: - u3.wavefile = 'extend-wav-with-silence --extra-silence-length={1} {0} - | '.format(u.wavefile, delta) - else: - assert parts[-1] == "|" - u3.wavefile = '{0} extend-wav-with-silence --extra-silence-length={1} - - | '.format(u.wavefile, delta) - u3.dur = allowed_dur2 - perturbed_utts += [u3] - -# 3. write to our dir -generate_kaldi_data_files(perturbed_utts, args.dir) + """ Write out a list of utterances as Kaldi data files into an + output data directory. + """ + + print("Exporting to {}...".format(outdir)) + speakers = {} + + with open(os.path.join(outdir, 'text'), 'w') as f: + for utt in utterances: + f.write(utt.to_kaldi_utt_str() + "\n") + + with open(os.path.join(outdir, 'wav.scp'), 'w') as f: + for utt in utterances: + f.write(utt.to_kaldi_wave_str() + "\n") + + with open(os.path.join(outdir, 'utt2dur'), 'w') as f: + for utt in utterances: + f.write(utt.to_kaldi_dur_str() + "\n") + + with open(os.path.join(outdir, 'utt2spk'), 'w') as f: + for utt in utterances: + f.write(utt.id + " " + utt.speaker + "\n") + if utt.speaker not in speakers: + speakers[utt.speaker] = [utt.id] + else: + speakers[utt.speaker].append(utt.id) + + with open(os.path.join(outdir, 'spk2utt'), 'w') as f: + for s in speakers: + f.write(s + " ") + for utt in speakers[s]: + f.write(utt + " ") + f.write('\n') + + +def find_duration_range(utterances, coverage_factor): + """Given a list of utterances, find the start and end duration to cover + + If we try to cover + all durations which occur in the training set, the number of + allowed lengths could become very large. 
+ + Returns + ------- + start_dur: int + end_dur: int + """ + durs = [] + for u in utterances: + durs.append(u.dur) + durs.sort() + to_ignore_dur = 0 + tot_dur = sum(durs) + for d in durs: + to_ignore_dur += d + if to_ignore_dur * 100.0 / tot_dur > coverage_factor: + start_dur = d + break + to_ignore_dur = 0 + for d in reversed(durs): + to_ignore_dur += d + if to_ignore_dur * 100.0 / tot_dur > coverage_factor: + end_dur = d + break + if start_dur < 0.3: + start_dur = 0.3 # a hard limit to avoid too many allowed lengths --not critical + return start_dur, end_dur + + +def find_allowed_durations(start_dur, end_dur, args): + """Given the start and end duration, find a set of + allowed durations spaced by args.factor%. Also write + out the list of allowed durations and the corresponding + allowed lengths (in frames) on disk. + + Returns + ------- + allowed_durations: list of allowed durations (in seconds) + """ + + allowed_durations = [] + d = start_dur + with open(os.path.join(args.dir, 'allowed_durs.txt'), 'wb') as durs_fp, \ + open(os.path.join(args.dir, 'allowed_lengths.txt'), 'wb') as lengths_fp: + while d < end_dur: + length = int(d * 1000 - args.frame_length) / args.frame_shift + 1 + if length % args.frame_subsampling_factor != 0: + length = (args.frame_subsampling_factor * + (length // args.frame_subsampling_factor)) + d = (args.frame_shift * (length - 1.0) + + args.frame_length + args.frame_shift / 2) / 1000.0 + allowed_durations.append(d) + durs_fp.write("{}\n".format(d)) + lengths_fp.write("{}\n".format(length)) + d *= args.factor + return allowed_durations + + + +def perturb_utterances(utterances, allowed_durations, args): + """Given a set of utterances and a set of allowed durations, generate + an extended set of perturbed utterances (all having an allowed duration) + + Returns + ------- + perturbed_utterances: list of pertubed utterances + """ + + perturbed_utterances = [] + for u in utterances: + # find i such that: allowed_durations[i-1] <= u.dur <= allowed_durations[i] + # i = len(allowed_durations) --> no upper bound + # i = 0 --> no lower bound + if u.dur < allowed_durations[0]: + i = 0 + elif u.dur > allowed_durations[-1]: + i = len(allowed_durations) + else: + i = 1 + while i < len(allowed_durations): + if u.dur <= allowed_durations[i] and u.dur >= allowed_durations[i - 1]: + break + i += 1 + + if i > 0 and args.speed_perturb: # we have a smaller allowed duration + allowed_dur = allowed_durations[i - 1] + speed = u.dur / allowed_dur + if max(speed, 1.0/speed) > args.factor: # this could happen for very short/long utterances + continue + u1 = copy.deepcopy(u) + u1.id = 'pv1-' + u.id + u1.speaker = 'pv1-' + u.speaker + u1.wavefile = '{} sox -t wav - -t wav - speed {} | '.format(u.wavefile, speed) + u1.dur = allowed_dur + perturbed_utterances.append(u1) + + + if i < len(allowed_durations): # we have a larger allowed duration + allowed_dur2 = allowed_durations[i] + speed = u.dur / allowed_dur2 + if max(speed, 1.0/speed) > args.factor: + continue + + ## Add two versions for the second allowed_duration + ## one version is by using speed modification using sox + ## the other is by extending by silence + if args.speed_perturb: + u2 = copy.deepcopy(u) + u2.id = 'pv2-' + u.id + u2.speaker = 'pv2-' + u.speaker + u2.wavefile = '{} sox -t wav - -t wav - speed {} | '.format(u.wavefile, speed) + u2.dur = allowed_dur2 + perturbed_utterances.append(u2) + + delta = allowed_dur2 - u.dur + if delta <= 1e-4: + continue + u3 = copy.deepcopy(u) + u3.id = 'pv3-' + u.id + u3.speaker = 'pv3-' + 
u.speaker + u3.wavefile = '{} extend-wav-with-silence --extra-silence-length={} - - | '.format(u.wavefile, delta) + u3.dur = allowed_dur2 + perturbed_utterances.append(u3) + return perturbed_utterances + + + +def main(): + args = get_args() + args.factor = 1.0 + args.factor / 100.0 + + if not os.path.exists(args.dir): + os.makedirs(args.dir) + + utterances = read_kaldi_datadir(args.srcdir) + + start_dur, end_dur = find_duration_range(utterances, args.coverage_factor) + print("Durations in the range [{},{}] will be covered." + "Coverage rate: {}%".format(start_dur, end_dur, + 100.0 - args.coverage_factor * 2)) + print("There will be {} unique allowed lengths" + "for the utterances.".format(int(math.log(end_dur / start_dur) / + math.log(args.factor)))) + + allowed_durations = find_allowed_durations(start_dur, end_dur, args) + + perturbed_utterances = perturb_utterances(utterances, allowed_durations, + args) + + generate_kaldi_data_files(perturbed_utterances, args.dir) + + +if __name__ == '__main__': + main() diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc index 0e0e895dd8f..b96cc9388f7 100644 --- a/src/chain/chain-denominator-smbr.cc +++ b/src/chain/chain-denominator-smbr.cc @@ -36,12 +36,12 @@ DenominatorSmbrComputation::DenominatorSmbrComputation( frames_per_sequence_(nnet_output.NumRows() / num_sequences_), exp_nnet_output_transposed_(nnet_output, kTrans), numerator_posteriors_transposed_(numerator_posteriors, kTrans), - nnet_output_deriv_transposed_( + nnet_output_acc_deriv_transposed_( exp_nnet_output_transposed_.NumRows(), std::min(exp_nnet_output_transposed_.NumCols(), static_cast(kMaxDerivTimeSteps) * num_sequences_)), - nnet_output_mmi_deriv_transposed_( + nnet_output_log_prob_deriv_transposed_( exp_nnet_output_transposed_.NumRows(), std::min(exp_nnet_output_transposed_.NumCols(), static_cast(kMaxDerivTimeSteps) * @@ -375,36 +375,51 @@ bool DenominatorSmbrComputation::BackwardSmbr( int32 chunk_frames = std::min(static_cast(kMaxDerivTimeSteps), frames_per_sequence_ - t), num_pdfs = exp_nnet_output_transposed_.NumRows(); - CuSubMatrix transposed_deriv_part( - nnet_output_deriv_transposed_, - 0, num_pdfs, - 0, chunk_frames * num_sequences_); - CuSubMatrix transposed_mmi_deriv_part( - nnet_output_mmi_deriv_transposed_, - 0, num_pdfs, - 0, chunk_frames * num_sequences_); + CuSubMatrix output_deriv_part( *nnet_output_deriv, t * num_sequences_, chunk_frames * num_sequences_, 0, num_pdfs); - output_deriv_part.AddMat(deriv_weight * opts_.smbr_factor, - transposed_deriv_part, kTrans); - output_deriv_part.AddMat(-deriv_weight * opts_.mmi_factor, - transposed_mmi_deriv_part, kTrans); - if (GetVerboseLevel() >= 2) { - CuVector deriv_sum(num_pdfs); - deriv_sum.AddColSumMat(1.0, transposed_deriv_part, 0.0); - CuVector mmi_deriv_sum(num_pdfs); - mmi_deriv_sum.AddColSumMat(1.0, transposed_mmi_deriv_part, 0.0); + // The following is needed so that the matrix will be of the same + // dimension as output_deriv_part. 
+ CuSubMatrix transposed_log_prob_deriv_part( + nnet_output_log_prob_deriv_transposed_, + 0, num_pdfs, + 0, chunk_frames * num_sequences_); + output_deriv_part.AddMat(-deriv_weight * opts_.mmi_factor, + transposed_log_prob_deriv_part, kTrans); + + CuSubMatrix transposed_acc_deriv_part( + nnet_output_acc_deriv_transposed_, + 0, num_pdfs, + 0, chunk_frames * num_sequences_); + output_deriv_part.AddMat(deriv_weight * opts_.smbr_factor, + transposed_acc_deriv_part, kTrans); - deriv_sum.Write(KALDI_LOG, false); - mmi_deriv_sum.Write(KALDI_LOG, false); + if (GetVerboseLevel() >= 2) { + CuVector acc_deriv_sum(num_pdfs); + acc_deriv_sum.AddColSumMat(1.0, transposed_acc_deriv_part, 0.0); + CuVector log_prob_deriv_sum(num_pdfs); + log_prob_deriv_sum.AddColSumMat(1.0, transposed_log_prob_deriv_part, 0.0); + + CuSubMatrix transposed_num_post( + numerator_posteriors_transposed_, + 0, num_pdfs, + 0, chunk_frames * num_sequences_); + + acc_deriv_sum.Write(KALDI_LOG, false); + log_prob_deriv_sum.Write(KALDI_LOG, false); } + transposed_log_prob_deriv_part.MulColsGroupVec(tot_smbr_); + output_deriv_part.AddMat(-deriv_weight * opts_.smbr_factor, + transposed_log_prob_deriv_part, kTrans); - if (t != 0) - transposed_deriv_part.SetZero(); + if (t != 0) { + transposed_acc_deriv_part.SetZero(); + transposed_log_prob_deriv_part.SetZero(); + } } } return ok_; @@ -459,9 +474,9 @@ void DenominatorSmbrComputation::BetaSmbrDashGeneralFrame(int32 t) { t * num_sequences_, num_sequences_), numerator_post(numerator_posteriors_transposed_, 0, num_pdfs, t * num_sequences_, num_sequences_), - log_prob_deriv(nnet_output_deriv_transposed_, 0, num_pdfs, + acc_deriv(nnet_output_acc_deriv_transposed_, 0, num_pdfs, t_wrapped * num_sequences_, num_sequences_), - log_prob_mmi_deriv(nnet_output_mmi_deriv_transposed_, 0, num_pdfs, + log_prob_deriv(nnet_output_log_prob_deriv_transposed_, 0, num_pdfs, t_wrapped * num_sequences_, num_sequences_); int32 num_hmm_states = den_graph_.NumStates(), @@ -484,8 +499,8 @@ void DenominatorSmbrComputation::BetaSmbrDashGeneralFrame(int32 t) { this_alpha_dash, this_alpha_smbr, next_beta, next_beta_smbr, this_beta_dash, this_beta_smbr, - log_prob_deriv.Data(), log_prob_deriv.Stride(), - log_prob_mmi_deriv.Data(), log_prob_mmi_deriv.Stride()); + acc_deriv.Data(), acc_deriv.Stride(), + log_prob_deriv.Data(), log_prob_deriv.Stride()); CU_SAFE_CALL(cudaGetLastError()); if (dimGrid.y == num_hmm_states) { break; // this is the normal case. 
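For orientation, the hunks above split what used to be a single derivative buffer into two accumulators, an accuracy-weighted occupancy (nnet_output_acc_deriv_transposed_) and a plain occupancy (nnet_output_log_prob_deriv_transposed_), which BackwardSmbr() recombines using the smbr factor, the mmi factor and the per-sequence total SMBR. The following is a minimal standalone sketch of that per-element combination, using illustrative names rather than the actual Kaldi types; it is only a summary of the AddMat() calls visible in the hunk above, not part of the patch.

// Sketch of the per-element derivative assembled in BackwardSmbr():
// 'acc' is the accuracy-weighted occupancy, 'occ' the plain occupancy,
// and 'tot_smbr' the per-sequence expected accuracy.  Illustrative only.
inline float SmbrOutputDeriv(float acc, float occ, float tot_smbr,
                             float smbr_factor, float mmi_factor,
                             float deriv_weight) {
  return deriv_weight * (smbr_factor * (acc - tot_smbr * occ)
                         - mmi_factor * occ);
}
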
@@ -507,12 +522,12 @@ void DenominatorSmbrComputation::BetaSmbrDashGeneralFrame(int32 t) { { int32 prob_stride = probs.Stride(), post_stride = numerator_post.Stride(), - deriv_stride = log_prob_deriv.Stride(), - mmi_deriv_stride = log_prob_mmi_deriv.Stride(); + acc_deriv_stride = acc_deriv.Stride(), + log_prob_deriv_stride = log_prob_deriv.Stride(); const BaseFloat *prob_data = probs.Data(); const BaseFloat *post_data = numerator_post.Data(); + BaseFloat *acc_deriv_data = acc_deriv.Data(); BaseFloat *log_prob_deriv_data = log_prob_deriv.Data(); - BaseFloat *log_prob_mmi_deriv_data = log_prob_mmi_deriv.Data(); for (int32 h = 0; h < num_hmm_states; h++) { for (int32 s = 0; s < num_sequences; s++) { BaseFloat this_alpha_dash_prob = this_alpha_dash[h * num_sequences + s], @@ -537,11 +552,11 @@ void DenominatorSmbrComputation::BetaSmbrDashGeneralFrame(int32 t) { tot_beta_smbr += (next_beta_smbr_j + post) * variable_factor; tot_variable_factor += variable_factor; BaseFloat occupation_prob = occupation_factor * variable_factor; - double this_gamma_r = occupation_prob * - (this_alpha_smbr_i + post + next_beta_smbr_j - tot_smbr_(s)); - log_prob_deriv_data[pdf_id * deriv_stride + s] += - this_gamma_r; - log_prob_mmi_deriv_data[pdf_id * mmi_deriv_stride + s] += + BaseFloat this_acc_r = occupation_prob * + (this_alpha_smbr_i + post + next_beta_smbr_j); + acc_deriv_data[pdf_id * acc_deriv_stride + s] += + this_acc_r; + log_prob_deriv_data[pdf_id * log_prob_deriv_stride + s] += occupation_prob; } this_beta_dash[h * num_sequences + s] = @@ -565,15 +580,8 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { this_beta_smbr(beta_smbr_.RowData(t % 2), alpha_beta_size); int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps), num_pdfs = exp_nnet_output_transposed_.NumRows(); - CuSubMatrix this_log_prob_deriv( - nnet_output_deriv_transposed_, 0, num_pdfs, - t_wrapped * num_sequences_, num_sequences_); - CuSubMatrix this_log_prob_mmi_deriv( - nnet_output_mmi_deriv_transposed_, 0, num_pdfs, - t_wrapped * num_sequences_, num_sequences_); BaseFloat alpha_beta_product = VecVec(this_alpha_dash, - this_beta_dash), - this_log_prob_mmi_deriv_sum = this_log_prob_mmi_deriv.Sum(); + this_beta_dash); if (!ApproxEqual(alpha_beta_product, num_sequences_)) { KALDI_WARN << "On time " << t << ", alpha-beta product " << alpha_beta_product << " != " << num_sequences_ @@ -611,16 +619,43 @@ void DenominatorSmbrComputation::BetaSmbrGeneralFrameDebug(int32 t) { << alpha_beta_smbr_sum << " = tot-smbr-sum"; } - // use higher tolerance, since we are using randomized pruning for the - // log-prob derivatives. - if (GetVerboseLevel() > 1 || !ApproxEqual( - this_log_prob_mmi_deriv_sum, num_sequences_, 0.01)) { - KALDI_WARN << "On time " << t << ", log-prob-mmi-deriv sum " - << this_log_prob_mmi_deriv_sum << " != " - << num_sequences_; - if (fabs(this_log_prob_mmi_deriv_sum - num_sequences_) > 2.0) { - KALDI_WARN << "Excessive error detected, will abandon this minibatch"; - ok_ = false; + { + CuSubMatrix this_log_prob_deriv( + nnet_output_log_prob_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_); + BaseFloat this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); + // use higher tolerance, since we are using randomized pruning for the + // log-prob derivatives. 
+ if (GetVerboseLevel() > 1 || !ApproxEqual( + this_log_prob_deriv_sum, num_sequences_, 0.01)) { + KALDI_WARN << "On time " << t << ", log-prob-deriv sum " + << this_log_prob_deriv_sum << " != " + << num_sequences_; + if (fabs(this_log_prob_deriv_sum - num_sequences_) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } + } + + { + BaseFloat tot_smbr = tot_smbr_.Sum(); + + CuSubMatrix this_acc_deriv( + nnet_output_acc_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_); + BaseFloat this_acc_deriv_sum = this_acc_deriv.Sum(); + // use higher tolerance, since we are using randomized pruning for the + // log-prob derivatives. + if (GetVerboseLevel() > 1 || !ApproxEqual( + this_acc_deriv_sum, tot_smbr, 0.01)) { + KALDI_WARN << "On time " << t << ", acc-deriv sum " + << this_acc_deriv_sum << " != " + << tot_smbr; + if (fabs(this_acc_deriv_sum - tot_smbr) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } } } } diff --git a/src/chain/chain-denominator-smbr.h b/src/chain/chain-denominator-smbr.h index b27415f98cc..ca526d68f2e 100644 --- a/src/chain/chain-denominator-smbr.h +++ b/src/chain/chain-denominator-smbr.h @@ -299,10 +299,10 @@ class DenominatorSmbrComputation { CuMatrix numerator_posteriors_transposed_; // the smbr derivs w.r.t. the nnet outputs (transposed) - CuMatrix nnet_output_deriv_transposed_; + CuMatrix nnet_output_acc_deriv_transposed_; - // the mmi derivs w.r.t. the nnet outputs (transposed) - CuMatrix nnet_output_mmi_deriv_transposed_; + // the log-prob derivs w.r.t. the nnet outputs (transposed) + CuMatrix nnet_output_log_prob_deriv_transposed_; // the (temporarily) alpha and (more permanently) alpha-dash probabilities; // dimension is (frames_per_sequence + 1) by (num-hmm-states * num-sequences + diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index c772ce10197..d2040b6edc2 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -64,10 +64,10 @@ extern "C" { const BaseFloat *next_beta_smbr, BaseFloat *this_beta, BaseFloat *this_beta_smbr, + BaseFloat *acc_deriv, + int32_cuda acc_deriv_stride, BaseFloat *log_prob_deriv, - int32_cuda log_prob_deriv_stride, - BaseFloat *log_prob_mmi_deriv, - int32_cuda log_prob_mmi_deriv_stride); + int32_cuda log_prob_deriv_stride); void cuda_chain_smbr_hmm_forward(dim3 Gr, dim3 Bl, const Int32Pair *backward_transitions, diff --git a/src/chain/chain-smbr-kernels.cu b/src/chain/chain-smbr-kernels.cu index 04c33aecf18..b7fef9b6fb0 100644 --- a/src/chain/chain-smbr-kernels.cu +++ b/src/chain/chain-smbr-kernels.cu @@ -216,8 +216,8 @@ static void _cuda_chain_smbr_hmm_backward( const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, BaseFloat *this_beta, BaseFloat *this_beta_smbr, - BaseFloat *log_prob_deriv, int32_cuda log_prob_deriv_stride, - BaseFloat *log_prob_mmi_deriv, int32_cuda log_prob_mmi_deriv_stride) { + BaseFloat *acc_deriv, int32_cuda acc_deriv_stride, + BaseFloat *log_prob_deriv, int32_cuda log_prob_deriv_stride) { // 'forward_transitions', indexed by hmm-state, consists of [start, end] // indexes into the 'transition_info' array. This is about the transitions // *out of* this state. 
'probs' contains the exponentiated neural net @@ -277,18 +277,18 @@ static void _cuda_chain_smbr_hmm_backward( + (next_beta_smbr_j1 + num_post1) * variable_factor1; tot_variable_factor += variable_factor0 + variable_factor1; BaseFloat occupation_prob0 = variable_factor0 * occupation_factor; - BaseFloat this_gamma_r0 = occupation_prob0 - * (this_alpha_smbr_i + num_post0 + next_beta_smbr_j0 - tot_smbr[s]); + BaseFloat this_acc_r0 = occupation_prob0 + * (this_alpha_smbr_i + num_post0 + next_beta_smbr_j0); + atomic_add(acc_deriv + (pdf_id0 * acc_deriv_stride + s), + this_acc_r0); atomic_add(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), - this_gamma_r0); - atomic_add(log_prob_mmi_deriv + (pdf_id0 * log_prob_mmi_deriv_stride + s), occupation_prob0); BaseFloat occupation_prob1 = variable_factor1 * occupation_factor; - BaseFloat this_gamma_r1 = occupation_prob1 - * (this_alpha_smbr_i + num_post1 + next_beta_smbr_j1 - tot_smbr[s]); + BaseFloat this_acc_r1 = occupation_prob1 + * (this_alpha_smbr_i + num_post1 + next_beta_smbr_j1); + atomic_add(acc_deriv + (pdf_id1 * acc_deriv_stride + s), + this_acc_r1); atomic_add(log_prob_deriv + (pdf_id1 * log_prob_deriv_stride + s), - this_gamma_r1); - atomic_add(log_prob_mmi_deriv + (pdf_id1 * log_prob_mmi_deriv_stride + s), occupation_prob1); } if (trans_iter != trans_end) { @@ -304,11 +304,11 @@ static void _cuda_chain_smbr_hmm_backward( tot_beta_smbr += (next_beta_smbr_j0 + num_post0) * variable_factor0; tot_variable_factor += variable_factor0; BaseFloat occupation_prob0 = variable_factor0 * occupation_factor; - BaseFloat this_gamma_r0 = occupation_prob0 - * (this_alpha_smbr_i + num_post0 + next_beta_smbr_j0 - tot_smbr[s]); + BaseFloat this_acc_r0 = occupation_prob0 + * (this_alpha_smbr_i + num_post0 + next_beta_smbr_j0); + atomic_add(acc_deriv + (pdf_id0 * acc_deriv_stride + s), + this_acc_r0); atomic_add(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), - this_gamma_r0); - atomic_add(log_prob_mmi_deriv + (pdf_id0 * log_prob_mmi_deriv_stride + s), occupation_prob0); } BaseFloat beta = tot_variable_factor / inv_arbitrary_scale; @@ -351,16 +351,16 @@ void cuda_chain_smbr_hmm_backward( const BaseFloat *this_alpha, const BaseFloat *this_alpha_smbr, const BaseFloat *next_beta, const BaseFloat *next_beta_smbr, BaseFloat *this_beta, BaseFloat *this_beta_smbr, + BaseFloat *acc_deriv, + int32_cuda acc_deriv_stride, BaseFloat *log_prob_deriv, - int32_cuda log_prob_deriv_stride, - BaseFloat *log_prob_mmi_deriv, - int32_cuda log_prob_mmi_deriv_stride) { + int32_cuda log_prob_deriv_stride) { _cuda_chain_smbr_hmm_backward<<>>( forward_transitions, transitions, num_sequences, num_hmm_states, probs, prob_stride, num_post, post_stride, tot_smbr, this_alpha, this_alpha_smbr, next_beta, next_beta_smbr, this_beta, this_beta_smbr, - log_prob_deriv, log_prob_deriv_stride, - log_prob_mmi_deriv, log_prob_mmi_deriv_stride); + acc_deriv, acc_deriv_stride, + log_prob_deriv, log_prob_deriv_stride); } diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 4123dea79f9..a14fedf84ea 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -136,14 +136,14 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrixBase *xent_output_deriv, + CuMatrix *xent_output_deriv, const CuArray *sil_indices) { - // num_posteriors is a matrix of size + // numerator_post is a matrix of size // (num_sequences * frames_per_sequence) x num_pdfs and is 
ordered in the // same way as nnet_output is i.e. // first the first frame of each sequence, then the second frame of // each sequence, and so on. - CuMatrix num_posteriors(nnet_output.NumRows(), + CuMatrix numerator_post(nnet_output.NumRows(), nnet_output.NumCols()); BaseFloat num_logprob_weighted; @@ -151,57 +151,69 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, NumeratorComputation numerator(supervision, nnet_output); // note: supervision.weight is included as a factor in the derivative from // the numerator object, and the logprob too. - num_logprob_weighted = opts.mmi_factor * numerator.Forward(); - numerator.Backward(&num_posteriors); + num_logprob_weighted = (opts.mmi_factor + opts.ml_factor) * numerator.Forward(); + numerator.Backward(&numerator_post); #if HAVE_CUDA == 1 - if (!CuDevice::Instantiate().Enabled() && GetVerboseLevel() >= 2) { - Posterior post(num_posteriors.NumRows()); - for (int32 i = 0; i < num_posteriors.NumRows(); i++) { - CuSubVector row(num_posteriors, i); - for (int32 j = 0; j < row.Dim(); j++) { - BaseFloat p = row(j); - if (p >= 0.01) { - post[i].push_back(std::make_pair(j, p)); + if (!CuDevice::Instantiate().Enabled()) +#endif + { // Debugging + if (GetVerboseLevel() >= 2) { + Posterior post(numerator_post.NumRows()); + for (int32 i = 0; i < numerator_post.NumRows(); i++) { + CuSubVector row(numerator_post, i); + for (int32 j = 0; j < row.Dim(); j++) { + BaseFloat p = row(j); + if (p >= 0.01) { + post[i].push_back(std::make_pair(j, p)); + } } } + PosteriorHolder::Write(KALDI_LOG, false, post); } - PosteriorHolder::Write(KALDI_LOG, false, post); } -#endif - if (nnet_output_deriv && opts.mmi_factor != 0.0) { - nnet_output_deriv->CopyFromMat(num_posteriors); - nnet_output_deriv->Scale(opts.mmi_factor); + if (nnet_output_deriv && (opts.mmi_factor != 0.0 || opts.ml_factor != 0.0)) { + nnet_output_deriv->CopyFromMat(numerator_post); + nnet_output_deriv->Scale(opts.mmi_factor + opts.ml_factor); } if (xent_output_deriv) { - xent_output_deriv->CopyFromMat(num_posteriors); + xent_output_deriv->Resize(nnet_output.NumRows(), nnet_output.NumCols()); + xent_output_deriv->CopyFromMat(numerator_post); } } if (sil_indices && opts.exclude_silence) { - // Exclude numerator posteriors for silence pdfs from accuracy - // computation. This is done by setting silence pdf posteiors to zero. - // sil_indices is expected to have -1 at the indexes corresponding to + // Exclude numerator posteriors for silence pdfs from accuracy + // computation. This is done by setting silence pdf posteriors to zero. + // sil_indices is expected to have -1 at the indexes corresponding to // silence pdfs, and "i" for any other index "i". - num_posteriors.CopyCols(num_posteriors, *sil_indices); + numerator_post.CopyCols(numerator_post, *sil_indices); } else if (sil_indices && opts.one_silence_class) { // Create a copy with only the silence pdf posteriors. CuMatrix silence_post(nnet_output.NumRows(), nnet_output.NumCols()); - silence_post.CopyCols(num_posteriors, *sil_indices); + silence_post.CopyCols(numerator_post, *sil_indices); // Sum the posteriors of silence pdfs to get posterior of silence class. CuVector total_silence_post(nnet_output.NumRows()); total_silence_post.AddColSumMat(1.0, silence_post, 0.0); // Copy the silence class posterior to the columns of the silence pdfs. 
- num_posteriors.CopyColsFromVec(total_silence_post, *sil_indices); + numerator_post.CopyColsFromVec(total_silence_post, *sil_indices); + } + + if (opts.smbr_threshold > 0) { + // Consider all posteriors below smbr_threshold to be 0. + CuMatrix tmp(numerator_post); + tmp.Add(-opts.smbr_threshold); + tmp.ApplyHeaviside(); + numerator_post.MulElements(tmp); } DenominatorSmbrComputation denominator(opts, den_graph, supervision.num_sequences, - nnet_output, num_posteriors); + nnet_output, numerator_post); BaseFloat den_logprob_negated; BaseFloat smbr_objf = denominator.ForwardSmbr(&den_logprob_negated); @@ -215,7 +227,7 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, bool ok = true; if (nnet_output_deriv) { - if (opts.mmi_factor == 0.0) nnet_output_deriv->SetZero(); + if (opts.mmi_factor == 0.0 && opts.ml_factor == 0.0) nnet_output_deriv->SetZero(); ok = denominator.BackwardSmbr(supervision.weight, nnet_output_deriv); } @@ -223,7 +235,7 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, *mmi_objf = supervision.weight * den_logprob_negated + num_logprob_weighted; *weight = supervision.weight * supervision.num_sequences * supervision.frames_per_sequence; - + BaseFloat total_objf = *objf + *mmi_objf; if (!((total_objf) - (total_objf) == 0) || !ok) { // inf or NaN detected, or denominator computation returned false. @@ -231,7 +243,7 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, nnet_output_deriv->SetZero(); if (xent_output_deriv) xent_output_deriv->SetZero(); - BaseFloat default_objf = -opts.mmi_factor * 10; + BaseFloat default_objf = -(opts.mmi_factor + opts.ml_factor) * 10; KALDI_WARN << "Objective function is " << (total_objf) << " and denominator computation (if done) returned " << std::boolalpha << ok diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 92573bc57ed..200786b9164 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -68,16 +68,18 @@ struct ChainTrainingOptions { std::string silence_pdfs_str; BaseFloat mmi_factor; + BaseFloat ml_factor; BaseFloat smbr_factor; + BaseFloat smbr_threshold; bool norm_regularize; ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), xent_regularize(0.0), use_smbr_objective(false), exclude_silence(false), one_silence_class(false), - mmi_factor(0.0), smbr_factor(1.0), + mmi_factor(1.0), ml_factor(0.0), smbr_factor(0.0), smbr_threshold(0.0), norm_regularize(false) { } - + void Register(OptionsItf *opts) { opts->Register("l2-regularize", &l2_regularize, "l2 regularization " "constant for 'chain' training, applied to the output " @@ -105,6 +107,9 @@ struct ChainTrainingOptions { opts->Register("mmi-factor", &mmi_factor, "When using smbr objective, interpolate mmi objective " "with this weight"); + opts->Register("ml-factor", &ml_factor, + "When using smbr objective, interpolate ml objective " + "with this weight"); opts->Register("smbr-factor", &smbr_factor, "When using smbr objective, interpolate smbr objective " "with this weight"); @@ -117,6 +122,8 @@ struct ChainTrainingOptions { "Treat all silence pdfs as a single class for accuracy " "computation in smBR training. 
--silence-pdfs is required " "if this options is true."); + opts->Register("smbr-threshold", &smbr_threshold, + "Posterior below this value is considered 0"); } }; From f94738faa3f6c3e68e70d980e2cdbce2152e1bad Mon Sep 17 00:00:00 2001 From: Pegita Date: Sun, 25 Feb 2018 22:07:04 -0500 Subject: [PATCH 127/174] modfied functions to accept new sort (sort by t and then n) in nnet3-merge-egs. --- src/matrix/sparse-matrix.cc | 83 ++++++++++++++++++++++++++------- src/matrix/sparse-matrix.h | 11 ++++- src/nnet3/nnet-example-utils.cc | 23 +++++++-- src/nnet3/nnet-example-utils.h | 18 ++++--- 4 files changed, 104 insertions(+), 31 deletions(-) diff --git a/src/matrix/sparse-matrix.cc b/src/matrix/sparse-matrix.cc index 38ad940fb45..5ad7f2bfeca 100644 --- a/src/matrix/sparse-matrix.cc +++ b/src/matrix/sparse-matrix.cc @@ -654,26 +654,50 @@ void SparseMatrix::Resize(MatrixIndexT num_rows, template void SparseMatrix::AppendSparseMatrixRows( - std::vector > *inputs) { + std::vector > *inputs, + bool sort_by_t) { rows_.clear(); size_t num_rows = 0; typename std::vector >::iterator input_iter = inputs->begin(), input_end = inputs->end(); - for (; input_iter != input_end; ++input_iter) + int32 local_row_size = input_iter->rows_.size(), + num_inputs = inputs->size(); + for (; input_iter != input_end; ++input_iter) { num_rows += input_iter->rows_.size(); + if (sort_by_t) + if (input_iter->rows_.size() == local_row_size) + KALDI_ERR << "we can not append sparse matrices with inconsistent " + << " number of rows, if sort_by_t is true"; + } rows_.resize(num_rows); typename std::vector >::iterator row_iter = rows_.begin(), row_end = rows_.end(); - for (input_iter = inputs->begin(); input_iter != input_end; ++input_iter) { - typename std::vector >::iterator - input_row_iter = input_iter->rows_.begin(), - input_row_end = input_iter->rows_.end(); - for (; input_row_iter != input_row_end; ++input_row_iter, ++row_iter) - row_iter->Swap(&(*input_row_iter)); + if (sort_by_t) { + // If true, the matrices appended to be sorted first by original row index (t) and next by matrix order in input. + // i.e. all rows with same index in local input matrix are appended in a same block. + int32 n = 0, t = 0; // 'n' is index over matrices and 't' is index for rows in matrixes. 
+ for (input_iter = inputs->begin(); input_iter != input_end; ++input_iter, ++n) { + typename std::vector >::iterator + input_row_iter = input_iter->rows_.begin(), + input_row_end = input_iter->rows_.end(); + t = 0; + for (; input_row_iter != input_row_end; ++input_row_iter, ++t) { + int32 src_row_index = n + t * num_inputs; + rows_[src_row_index].Swap(&(*input_row_iter)); + } + } + } else { + for (input_iter = inputs->begin(); input_iter != input_end; ++input_iter) { + typename std::vector >::iterator + input_row_iter = input_iter->rows_.begin(), + input_row_end = input_iter->rows_.end(); + for (; input_row_iter != input_row_end; ++input_row_iter, ++row_iter) + row_iter->Swap(&(*input_row_iter)); + } + KALDI_ASSERT(row_iter == row_end); } - KALDI_ASSERT(row_iter == row_end); int32 num_cols = NumCols(); for (row_iter = rows_.begin(); row_iter != row_end; ++row_iter) { if (row_iter->Dim() != num_cols) @@ -916,7 +940,8 @@ void GeneralMatrix::Read(std::istream &is, bool binary) { void AppendGeneralMatrixRows(const std::vector &src, - GeneralMatrix *mat) { + GeneralMatrix *mat, + bool sort_by_t) { mat->Clear(); int32 size = src.size(); if (size == 0) @@ -933,7 +958,7 @@ void AppendGeneralMatrixRows(const std::vector &src, for (int32 i = 0; i < size; i++) sparse_mats[i] = src[i]->GetSparseMatrix(); SparseMatrix appended_mat; - appended_mat.AppendSparseMatrixRows(&sparse_mats); + appended_mat.AppendSparseMatrixRows(&sparse_mats, sort_by_t); mat->SwapSparseMatrix(&appended_mat); } else { int32 tot_rows = 0, num_cols = -1; @@ -950,15 +975,37 @@ void AppendGeneralMatrixRows(const std::vector &src, } Matrix appended_mat(tot_rows, num_cols, kUndefined); int32 row_offset = 0; - for (int32 i = 0; i < size; i++) { - const GeneralMatrix &src_mat = *(src[i]); - int32 src_rows = src_mat.NumRows(); - if (src_rows != 0) { - SubMatrix dest_submat(appended_mat, row_offset, src_rows, - 0, num_cols); - src_mat.CopyToMat(&dest_submat); + if (sort_by_t) { + // reorder the src mat rows to be inserted in appended matrix, in order to + // have sorted matrix first by 't' and next by 'n'. + int32 local_row_size = src[0]->NumRows(); + for (int32 i = 0; i < size; i++) { + const GeneralMatrix &src_mat = *(src[i]); + Matrix full_src_mat(src_mat.NumRows(), src_mat.NumCols()); + src_mat.CopyToMat(&full_src_mat); + int32 src_rows = src_mat.NumRows(); + if (src_rows != local_row_size) + KALDI_ERR << "Appending rows of matrices with inconsistent num-rows " + << "with sort-by-t=true is not possible:"; + std::vector reorder_indexes(local_row_size, + static_cast(NULL)); + for (int32 j = 0; j < src_rows; j++) { + reorder_indexes[j] = j * size + i; + } + full_src_mat.AddToRows(1.0, &(reorder_indexes[0]), &appended_mat); row_offset += src_rows; } + } else { + for (int32 i = 0; i < size; i++) { + const GeneralMatrix &src_mat = *(src[i]); + int32 src_rows = src_mat.NumRows(); + if (src_rows != 0) { + SubMatrix dest_submat(appended_mat, row_offset, src_rows, + 0, num_cols); + src_mat.CopyToMat(&dest_submat); + row_offset += src_rows; + } + } } KALDI_ASSERT(row_offset == tot_rows); mat->SwapFullMatrix(&appended_mat); diff --git a/src/matrix/sparse-matrix.h b/src/matrix/sparse-matrix.h index 60085b93fbe..48e085f1e4f 100644 --- a/src/matrix/sparse-matrix.h +++ b/src/matrix/sparse-matrix.h @@ -201,7 +201,13 @@ class SparseMatrix { /// function is destructive of the inputs. Requires, obviously, /// that the inputs all have the same dimension (although some may be /// empty). 
- void AppendSparseMatrixRows(std::vector > *inputs);
+ ///
+ /// If sort_by_t is true, the sparse matrices are appended so that the output
+ /// rows are sorted first by their local row index (t) and then by the matrix
+ /// index (n), i.e. all rows with the same local index end up in the same block.
+ /// In that case all input matrices must have the same number of rows.
+ void AppendSparseMatrixRows(std::vector > *inputs,
+ bool sort_by_t=false);
 
   SparseMatrix() { }
 
@@ -383,7 +389,8 @@ class GeneralMatrix {
 /// Does not preserve compression, if inputs were compressed; you have to
 /// re-compress manually, if that's what you need.
 void AppendGeneralMatrixRows(const std::vector &src,
-                             GeneralMatrix *mat);
+                             GeneralMatrix *mat,
+                             bool sort_by_t = false);
 
 
 /// Outputs a SparseMatrix containing only the rows r of "in" such that
diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc
index 62fc88521bc..82fbee1cf22 100644
--- a/src/nnet3/nnet-example-utils.cc
+++ b/src/nnet3/nnet-example-utils.cc
@@ -89,7 +89,8 @@ static void MergeIo(const std::vector &src,
                     const std::vector &names,
                     const std::vector &sizes,
                     bool compress,
-                    NnetExample *merged_eg) {
+                    NnetExample *merged_eg,
+                    bool sort_by_t) {
   // The total number of Indexes we have across all examples.
   int32 num_feats = names.size();
 
@@ -143,13 +144,24 @@ static void MergeIo(const std::vector &src,
                      "Merging already-merged egs? Not currently supported.");
         output_iter[i].n = n;
       }
+
       this_offset += this_size;  // note: this_offset is a reference.
     }
   }
+  // If sort_by_t is true, the indexes are rearranged so that they are sorted
+  // first by 't' and next by 'n'.
+  for (int32 f = 0; f < num_feats; f++) {
+    NnetIo output_io = merged_eg->io[f];
+    if (sort_by_t)
+      if (output_io.name == "output")
+        std::sort(output_io.indexes.begin(), output_io.indexes.end());
+  }
+
   KALDI_ASSERT(cur_size == sizes);
   for (int32 f = 0; f < num_feats; f++) {
     AppendGeneralMatrixRows(output_lists[f],
-                            &(merged_eg->io[f].features));
+                            &(merged_eg->io[f].features),
+                            sort_by_t);
     if (compress) {
       // the following won't do anything if the features were sparse.
       merged_eg->io[f].features.Compress();
@@ -161,14 +173,15 @@ static void MergeIo(const std::vector &src,
 
 void MergeExamples(const std::vector &src,
                    bool compress,
-                   NnetExample *merged_eg) {
+                   NnetExample *merged_eg,
+                   bool sort_by_t) {
   KALDI_ASSERT(!src.empty());
   std::vector io_names;
   GetIoNames(src, &io_names);
   // the sizes are the total number of Indexes we have across all examples.
   std::vector io_sizes;
   GetIoSizes(src, io_names, &io_sizes);
-  MergeIo(src, io_names, io_sizes, compress, merged_eg);
+  MergeIo(src, io_names, io_sizes, compress, merged_eg, sort_by_t);
 }
 
 void ShiftExampleTimes(int32 t_offset,
@@ -1225,7 +1238,7 @@ void ExampleMerger::WriteMinibatch(const std::vector &egs) {
   int32 minibatch_size = egs.size();
   stats_.WroteExample(eg_size, structure_hash, minibatch_size);
   NnetExample merged_eg;
-  MergeExamples(egs, config_.compress, &merged_eg);
+  MergeExamples(egs, config_.compress, &merged_eg, config_.sort_by_t);
   std::ostringstream key;
   key << "merged-" << (num_egs_written_++) << "-" << minibatch_size;
   writer_->Write(key.str(), merged_eg);
diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h
index 5f6c69f7d96..9d55f3b0d7d 100644
--- a/src/nnet3/nnet-example-utils.h
+++ b/src/nnet3/nnet-example-utils.h
@@ -33,10 +33,14 @@ namespace nnet3 {
 /** Merge a set of input examples into a single example (typically the size of
     "src" will be the minibatch size).  Will crash if "src" is the empty vector.
If "compress" is true, it will compress any non-sparse features in the output. + + If sort_by_t is true, the examples and indexes for output are sorted first + by 't' and then by 'n' index. */ void MergeExamples(const std::vector &src, bool compress, - NnetExample *dest); + NnetExample *dest, + bool sort_by_t = false); /** Shifts the time-index t of everything in the "eg" by adding "t_offset" to @@ -334,12 +338,14 @@ class ExampleMergingConfig { std::string measure_output_frames; // for back-compatibility, not used. std::string minibatch_size; std::string discard_partial_minibatches; // for back-compatibility, not used. - + bool sort_by_t; // If true, the examples and indexes are sorted + // first by 't' and next by 'n'. ExampleMergingConfig(const char *default_minibatch_size = "256"): compress(false), measure_output_frames("deprecated"), minibatch_size(default_minibatch_size), - discard_partial_minibatches("deprecated") { } + discard_partial_minibatches("deprecated"), + sort_by_t(false) { } void Register(OptionsItf *po) { po->Register("compress", &compress, "If true, compress the output examples " @@ -363,6 +369,9 @@ class ExampleMergingConfig { "--minibatch-size=128=64:128,256/256=32:64,128. Egs are given " "minibatch-sizes based on the specified eg-size closest to " "their actual size."); + po->Register("sort-by-t", &sort_by_t, + "If true, the features in examples and indexes are sorted " + "first by 't' and next by 'n'."); } @@ -517,7 +526,6 @@ class ExampleMerger { const ExampleMergingConfig &config_; NnetExampleWriter *writer_; ExampleMergingStats stats_; - // Note: the "key" into the egs is the first element of the vector. typedef unordered_map, NnetExampleStructureHasher, @@ -525,8 +533,6 @@ class ExampleMerger { MapType eg_to_egs_; }; - - } // namespace nnet3 } // namespace kaldi From 40fa1541cfa5cd4403ecc5ac11af836c656b9266 Mon Sep 17 00:00:00 2001 From: Pegita Date: Tue, 27 Feb 2018 15:52:45 -0500 Subject: [PATCH 128/174] fixed some issues. --- src/chain/chain-supervision.cc | 29 +-------------------- src/chain/chain-supervision.h | 7 +++-- src/chainbin/nnet3-chain-get-egs-post.cc | 33 +++++++++++------------- src/chainbin/nnet3-chain-get-egs.cc | 4 +-- 4 files changed, 20 insertions(+), 53 deletions(-) diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 7d87201dfdd..c38cd4698f7 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -683,34 +683,7 @@ bool AddWeightToFst(const fst::StdVectorFst &normalization_fst, bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, Supervision *supervision) { - // remove epsilons before composing. 'normalization_fst' has noepsilons so - // the composed result will be epsilon free. - fst::StdVectorFst supervision_fst_noeps(supervision->fst); - fst::RmEpsilon(&supervision_fst_noeps); - if (!TryDeterminizeMinimize(kSupervisionMaxStates, - &supervision_fst_noeps)) - return false; - - // note: by default, 'Compose' will call 'Connect', so if the - // resulting FST is not connected, it will end up empty. - fst::StdVectorFst composed_fst; - fst::Compose(supervision_fst_noeps, normalization_fst, - &composed_fst); - if (composed_fst.NumStates() == 0) - return false; - // projection should not be necessary, as both FSTs are acceptors. - // determinize and minimize to make it as compact as possible. 
- - if (!TryDeterminizeMinimize(kSupervisionMaxStates, - &composed_fst)) - return false; - supervision->fst = composed_fst; - - // Make sure the states are numbered in increasing order of time. - SortBreadthFirstSearch(&(supervision->fst)); - KALDI_ASSERT(supervision->fst.Properties(fst::kAcceptor, true) == fst::kAcceptor); - KALDI_ASSERT(supervision->fst.Properties(fst::kIEpsilons, true) == 0); - return true; + return AddWeightToFst(normalization_fst, &(supervision->fst)); } void SplitIntoRanges(int32 num_frames, diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index c54d4770aa0..36401009b15 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -320,13 +320,12 @@ class SupervisionSplitter { /// This function also removes epsilons and makes sure supervision->fst has the /// required sorting of states. Think of it as the final stage in preparation /// of the supervision FST. -bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, - Supervision *supervision); - - bool AddWeightToFst(const fst::StdVectorFst &normalization_fst, fst::StdVectorFst *supervision_fst); +bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, + Supervision *supervision); + /// Assuming the 'fst' is epsilon-free, connected, and has the property that all /// paths from the start-state are of the same length, output a vector /// containing that length (from the start-state to the current state) to diff --git a/src/chainbin/nnet3-chain-get-egs-post.cc b/src/chainbin/nnet3-chain-get-egs-post.cc index 9aa0eba0fb8..f3b82f2229d 100644 --- a/src/chainbin/nnet3-chain-get-egs-post.cc +++ b/src/chainbin/nnet3-chain-get-egs-post.cc @@ -134,13 +134,7 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, } int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; - - fst::StdVectorFst sup_fst, - scaled_normalization_fst(normalization_fst); - ConvertLatticeToPdfLabels(tmodel, lat, &sup_fst); - ScaleFst(0.5, &scaled_normalization_fst); // Scale lattice to have weights similar - // to weights used to combine lm weight - // with acoustic weight in sup_lat + fst::StdVectorFst sup_fst; if (normalization_fst.NumStates() > 0 && !chain::AddWeightToFst(normalization_fst, &sup_fst)) { KALDI_WARN << "For utterance " << utt_id << ", feature frames " @@ -249,15 +243,13 @@ int main(int argc, char *argv[]) { "ready for training; in that case they should later be processed\n" "with nnet3-chain-normalize-egs\n" "\n" - "Usage: nnet3-chain-get-egs [options] [] " - " \n" + "Usage: nnet3-chain-get-egs-post [options] [] " + " \n" "\n" "An example [where $feats expands to the actual features]:\n" - "chain-get-supervision [args] | \\\n" - " nnet3-chain-get-egs --left-context=25 --right-context=9 --num-frames=20 dir/normalization.fst \\\n" - " \"$feats\" ark,s,cs:- ark:cegs.1.ark\n" - "Note: the --frame-subsampling-factor option must be the same as given to\n" - "chain-get-supervision.\n"; + "nnet3-chain-get-egs-post --left-context=25 --right-context=9\n" + "--num-frames=20 dir/normalization.fst \"$feats\" \n" + "ark:lat.1.ark ark:cegs.1.ark"; bool compress = true; int32 length_tolerance = 100, online_ivector_period = 1; @@ -278,9 +270,7 @@ int main(int argc, char *argv[]) { po.Register("ivectors", &online_ivector_rspecifier, "Alias for " "--online-ivectors option, for back compatibility"); po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of " - "ivector features, as a matrix."); - 
po.Register("online-ivector-period", &online_ivector_period, "Number of " - "frames between iVectors in matrices supplied to the " + "ivector features, as a matrix." "--online-ivectors option"); po.Register("srand", &srand_seed, "Seed for random number generator "); po.Register("length-tolerance", &length_tolerance, "Tolerance for " @@ -376,8 +366,15 @@ int main(int argc, char *argv[]) { num_err++; continue; } + // we scale normalization fst to have similar weights used to combine lm weight + // with acoustic weight in sup_lat. + fst::StdVectorFst sup_fst, scaled_normalization_fst(normalization_fst); + ConvertLatticeToPdfLabels(tmodel, lat, &sup_fst); + ScaleFst(0.5, &scaled_normalization_fst); // Scale lattice to have weights similar + // to weights used to combine lm weight + // with acoustic weight in sup_lat int32 num_output_frames = 1; - if (!ProcessFile(normalization_fst, feats, + if (!ProcessFile(scaled_normalization_fst, feats, online_ivector_feats, online_ivector_period, lat, num_output_frames, key, compress, num_pdfs, tmodel, diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index 206921771c8..c8c251900ec 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -42,7 +42,6 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, const GeneralMatrix &feats, const MatrixBase *ivector_feats, int32 ivector_period, - const Lattice &lattice, const chain::Supervision &supervision, const std::string &utt_id, bool compress, @@ -279,13 +278,12 @@ int main(int argc, char *argv[]) { num_err++; continue; } - /* + if (!ProcessFile(normalization_fst, feats, online_ivector_feats, online_ivector_period, supervision, key, compress, &utt_splitter, &example_writer)) num_err++; - */ } } if (num_err > 0) From 6437c044e6d0a75f96bba06c9f21f337f7dbde09 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 1 Mar 2018 18:16:25 -0500 Subject: [PATCH 129/174] chain-smbr: Minor bug fixes --- src/chain/chain-training.cc | 24 ++++++++++++++++-------- src/nnet3/nnet-chain-diagnostics.cc | 3 ++- src/nnet3/nnet-chain-training.cc | 4 +++- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index a14fedf84ea..68c26bc07a7 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -183,6 +183,22 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, } } + + if (opts.smbr_threshold > 0) { + KALDI_ASSERT(opts.smbr_threshold > 1.0 / nnet_output.NumCols()); + + // Consider all posteriors below smbr_threshold to be 0. + CuMatrix tmp(numerator_post); + tmp.Add(-opts.smbr_threshold); + tmp.ApplyHeaviside(); + numerator_post.MulElements(tmp); + + CuVector normalizer(nnet_output.NumRows()); + normalizer.AddColSumMat(1.0, numerator_post); + normalizer.Add(1e-8); + numerator_post.DivRowsVec(normalizer); + } + if (sil_indices && opts.exclude_silence) { // Exclude numerator posteriors for silence pdfs from accuracy // computation. This is done by setting silence pdf posteriors to zero. @@ -203,14 +219,6 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, numerator_post.CopyColsFromVec(total_silence_post, *sil_indices); } - if (opts.smbr_threshold > 0) { - // Consider all posteriors below smbr_threshold to be 0. 
- CuMatrix tmp(numerator_post); - tmp.Add(-opts.smbr_threshold); - tmp.ApplyHeaviside(); - numerator_post.MulElements(tmp); - } - DenominatorSmbrComputation denominator(opts, den_graph, supervision.num_sequences, nnet_output, numerator_post); diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index be83581044c..0e0fe03cd0c 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -274,7 +274,8 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, std::vector aux_objf_scales(1, objf_scale); // for l2 term if (chain_config_.use_smbr_objective) { this_objf_scale *= chain_config_.smbr_factor; - aux_objf_scales.push_back(objf_scale * chain_config_.mmi_factor); + aux_objf_scales.push_back( + objf_scale * (chain_config_.mmi_factor + chain_config_.ml_factor)); } ChainObjectiveInfo totals(this_objf_scale, aux_objf_scales); diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index df047031fd5..e92151830c0 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -359,7 +359,9 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, std::vector aux_objf_scales(1, objf_scale); // l2_term if (opts_.chain_config.use_smbr_objective) { this_objf_scale *= opts_.chain_config.smbr_factor; - aux_objf_scales.push_back(objf_scale * opts_.chain_config.mmi_factor); + aux_objf_scales.push_back( + objf_scale * + (opts_.chain_config.mmi_factor + opts_.chain_config.ml_factor)); } ObjectiveFunctionInfo totals(objf_scale, aux_objf_scales); From 70b4d88b004732f67f5f44464494aa4be6c5842b Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 7 Mar 2018 17:24:05 -0500 Subject: [PATCH 130/174] Changes related to python3 --- egs/wsj/s5/steps/libs/__init__.py | 2 +- egs/wsj/s5/steps/libs/nnet3/report/__init__.py | 2 +- egs/wsj/s5/steps/libs/nnet3/report/log_parse.py | 2 +- egs/wsj/s5/steps/libs/nnet3/train/__init__.py | 2 ++ .../s5/steps/libs/nnet3/train/chain_objf/__init__.py | 2 +- egs/wsj/s5/steps/libs/nnet3/train/common.py | 5 +++-- .../s5/steps/libs/nnet3/train/dropout_schedule.py | 12 +++++++++--- egs/wsj/s5/steps/nnet3/chain/train.py | 10 +++++----- egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh | 2 +- 9 files changed, 24 insertions(+), 15 deletions(-) diff --git a/egs/wsj/s5/steps/libs/__init__.py b/egs/wsj/s5/steps/libs/__init__.py index 013c95d0b3f..b78141be659 100644 --- a/egs/wsj/s5/steps/libs/__init__.py +++ b/egs/wsj/s5/steps/libs/__init__.py @@ -6,6 +6,6 @@ """ This package contains modules and subpackages used in kaldi scripts. """ -import common +from . import common __all__ = ["common"] diff --git a/egs/wsj/s5/steps/libs/nnet3/report/__init__.py b/egs/wsj/s5/steps/libs/nnet3/report/__init__.py index 0566735d709..620c4238a22 100644 --- a/egs/wsj/s5/steps/libs/nnet3/report/__init__.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/__init__.py @@ -3,6 +3,6 @@ # Copyright 2016 Vimal Manohar # Apache 2.0. -import log_parse +from . 
import log_parse __all__ = ["log_parse"] diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index dc8a7398520..5dc2faf07e1 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -439,7 +439,7 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output", get_sm try: report.append("%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), x[1], x[2], x[2]-x[1])) - except KeyError, IndexError: + except (KeyError, IndexError): continue total_time = 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/train/__init__.py b/egs/wsj/s5/steps/libs/nnet3/train/__init__.py index 0503c0135cd..9679634658d 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/__init__.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/__init__.py @@ -9,3 +9,5 @@ frame_level_objf -- For both recurrent and non-recurrent architectures chain_objf -- LF-MMI objective training """ + +from . import common, dropout_schedule diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/__init__.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/__init__.py index f6cb292e829..d7fccd8bbe6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/__init__.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/__init__.py @@ -7,6 +7,6 @@ deep neural network acoustic model with chain objective. """ -import acoustic_model +from . import acoustic_model __all__ = ["acoustic_model"] diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index bd7301a7997..7435a34111a 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -14,11 +14,12 @@ import os import math import re +import sys import shutil import libs.common as common_lib -import libs.nnet3.train.dropout_schedule as dropout_schedule -from dropout_schedule import * +import libs.nnet3.train.dropout_schedule +from libs.nnet3.train.dropout_schedule import * logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py index 9d3934873d4..2471abf040f 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py @@ -210,14 +210,20 @@ def _get_dropout_proportions(dropout_schedule, data_fraction): return dropout_proportions -def get_schedule_value(schedule, data_fraction): +def get_schedule_string(schedule, data_fraction): if schedule is None: return 0 proportions = _get_dropout_proportions( schedule, data_fraction) - assert len(proportions) == 1 - assert len(proportions[0]) == 2 and proportions[0][0] == '*' + proportion_string = [] + + for component_name, proportion in proportions: + proportion_string.append( + "{}:{}".format(component_name, proportion)) + + ' '.join(proportion_string) + return proportions[0][1] diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index d216a717d75..e8a72641277 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -577,7 +577,7 @@ def train(args, run_opts): if args.objective_scales is not None else "") smbr_factor = 0.0 if args.smbr_factor_schedule is not None: - smbr_factor = common_train_lib.get_schedule_value( + smbr_factor = common_train_lib.get_schedule_string( args.smbr_factor_schedule, float(num_archives_processed) / num_archives_to_process) @@ -598,14 +598,14 @@ def 
train(args, run_opts): objective_opts += " " + args.smbr_extra_opts if args.mmi_factor_schedule is not None: - mmi_factor = common_train_lib.get_schedule_value( + mmi_factor = common_train_lib.get_schedule_string( args.mmi_factor_schedule, float(num_archives_processed) / num_archives_to_process) objective_opts += " --mmi-factor={0}".format(mmi_factor) if args.ml_factor_schedule is not None: - ml_factor = common_train_lib.get_schedule_value( + ml_factor = common_train_lib.get_schedule_string( args.ml_factor_schedule, float(num_archives_processed) / num_archives_to_process) @@ -690,7 +690,7 @@ def train(args, run_opts): if args.objective_scales is not None else "") smbr_factor = 0.0 if args.smbr_factor_schedule is not None: - smbr_factor = common_train_lib.get_schedule_value( + smbr_factor = common_train_lib.get_schedule_string( args.smbr_factor_schedule, 1.0) objective_opts += " --smbr-factor={0}".format(smbr_factor) @@ -708,7 +708,7 @@ def train(args, run_opts): objective_opts += " --silence-pdfs=" + silence_pdfs if args.mmi_factor_schedule is not None: - mmi_factor = common_train_lib.get_schedule_value( + mmi_factor = common_train_lib.get_schedule_string( args.mmi_factor_schedule, 1.0) objective_opts += " --mmi-factor={0}".format(mmi_factor) diff --git a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh index 1e708d58915..b080b548a0a 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh +++ b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh @@ -109,7 +109,7 @@ for lang in $(seq 0 $[$num_langs-1]);do rm -f $megs_dir/lang${lang}_${egs_prefix}scp $megs_dir/lang${lang}_train_diagnostic.scp \ $megs_dir/lang${lang}_valid_diagnostic.scp $megs_dir/lang${lang}_combine.scp - if [ `echo ${num_copies_per_lang[$lang]} | awk "{print int($num_copies_per_lang)}"` != ${num_copies_per_lang[$lang]} ]; then + if [ $(perl -e "{print int(${num_copies_per_lang[$lang]})}") != ${num_copies_per_lang[$lang]} ]; then echo "$0: Expected --lang2num-copies to have only integers; " echo "$0: got ${num_copies_per_lang[$lang]} for language $lang" exit 1 From addb032dc6d020e5af362aad836aabcd3f623795 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 8 Mar 2018 17:00:37 -0500 Subject: [PATCH 131/174] Fix subsampling factor in nnet3 egs --- src/nnet3bin/nnet3-get-egs-dense-targets.cc | 2 +- src/nnet3bin/nnet3-get-egs.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nnet3bin/nnet3-get-egs-dense-targets.cc b/src/nnet3bin/nnet3-get-egs-dense-targets.cc index c62d453d1f2..a1902071b60 100644 --- a/src/nnet3bin/nnet3-get-egs-dense-targets.cc +++ b/src/nnet3bin/nnet3-get-egs-dense-targets.cc @@ -127,7 +127,7 @@ static bool ProcessFile(const GeneralMatrix &feats, } // push this created targets matrix into the eg - eg.io.push_back(NnetIo("output", 0, targets_part)); + eg.io.push_back(NnetIo("output", 0, targets_part, frame_subsampling_factor)); if (compress) eg.Compress(); diff --git a/src/nnet3bin/nnet3-get-egs.cc b/src/nnet3bin/nnet3-get-egs.cc index cec9549541d..fed6d529a82 100644 --- a/src/nnet3bin/nnet3-get-egs.cc +++ b/src/nnet3bin/nnet3-get-egs.cc @@ -119,7 +119,7 @@ static bool ProcessFile(const GeneralMatrix &feats, iter->second *= chunk.output_weights[i]; } - eg.io.push_back(NnetIo("output", num_pdfs, 0, labels)); + eg.io.push_back(NnetIo("output", num_pdfs, 0, labels, frame_subsampling_factor)); if (compress) eg.Compress(); From 1f024bdc844c31656cf18628dc9612165c805ad0 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 12 
Mar 2018 19:23:44 -0400 Subject: [PATCH 132/174] Fix for segmentation --- egs/aspire/s5/local/run_asr_segmentation.sh | 14 +----- egs/babel/s5d/local/run_asr_segmentation.sh | 14 ++---- egs/swbd/s5c/local/run_asr_segmentation.sh | 12 +---- .../merge_segment_targets_to_recording.py | 45 ++++++++++++------- .../steps/segmentation/prepare_targets_gmm.sh | 19 +++++--- 5 files changed, 48 insertions(+), 56 deletions(-) diff --git a/egs/aspire/s5/local/run_asr_segmentation.sh b/egs/aspire/s5/local/run_asr_segmentation.sh index 731b6721a78..a5928c1d1fa 100755 --- a/egs/aspire/s5/local/run_asr_segmentation.sh +++ b/egs/aspire/s5/local/run_asr_segmentation.sh @@ -107,20 +107,8 @@ if [ $stage -le 1 ]; then fi ############################################################################### -# Get feats for the manual segments +# Prepare SAD targets for recordings ############################################################################### -if [ $stage -le 2 ]; then - if [ ! -f ${data_dir}/segments ]; then - utils/data/get_segments_for_data.sh $data_dir > $data_dir/segments - fi - utils/data/subsegment_data_dir.sh $whole_data_dir ${data_dir}/segments ${data_dir}/tmp - cp $data_dir/tmp/feats.scp $data_dir - - # Use recording as the "speaker". This is required by prepare_targets_gmm.sh script. - awk '{print $1" "$2}' $data_dir/segments > $data_dir/utt2spk - utils/utt2spk_to_spk2utt.pl $data_dir/utt2spk > $data_dir/spk2utt -fi - if [ $stage -le 3 ]; then steps/segmentation/prepare_targets_gmm.sh --stage $prepare_targets_stage \ --train-cmd "$train_cmd" --decode-cmd "$decode_cmd" \ diff --git a/egs/babel/s5d/local/run_asr_segmentation.sh b/egs/babel/s5d/local/run_asr_segmentation.sh index 7bfc3fd60ca..4240d25b562 100755 --- a/egs/babel/s5d/local/run_asr_segmentation.sh +++ b/egs/babel/s5d/local/run_asr_segmentation.sh @@ -97,21 +97,13 @@ if [ $stage -le 1 ]; then steps/make_plp.sh --cmd "$train_cmd" --nj $nj --write-utt2num-frames true \ ${whole_data_dir} || exit 1 fi + steps/compute_cmvn_stats.sh $whole_data_dir + utils/fix_data_dir.sh $whole_data_dir fi ############################################################################### -# Get feats for the manual segments +# Prepare SAD targets for recordings ############################################################################### -if [ $stage -le 2 ]; then - if [ ! -f ${data_dir}/segments ]; then - utils/data/get_segments_for_data.sh $data_dir > $data_dir/segments - fi - utils/data/subsegment_data_dir.sh $whole_data_dir ${data_dir}/segments ${data_dir}/tmp - cp $data_dir/tmp/feats.scp $data_dir - awk '{print $1" "$2}' $data_dir/segments > $data_dir/utt2spk - utils/utt2spk_to_spk2utt.pl $data_dir/utt2spk > $data_dir/spk2utt -fi - if [ $stage -le 3 ]; then steps/segmentation/prepare_targets_gmm.sh --stage $prepare_targets_stage \ --train-cmd "$train_cmd" --decode-cmd "$decode_cmd" \ diff --git a/egs/swbd/s5c/local/run_asr_segmentation.sh b/egs/swbd/s5c/local/run_asr_segmentation.sh index af7d3a428ba..1a405bc5772 100755 --- a/egs/swbd/s5c/local/run_asr_segmentation.sh +++ b/egs/swbd/s5c/local/run_asr_segmentation.sh @@ -101,18 +101,8 @@ if [ $stage -le 1 ]; then fi ############################################################################### -# Get feats for the manual segments +# Prepare SAD targets for recordings ############################################################################### -if [ $stage -le 2 ]; then - if [ ! 
-f ${data_dir}/segments ]; then - utils/data/get_segments_for_data.sh $data_dir > $data_dir/segments - fi - utils/data/subsegment_data_dir.sh $whole_data_dir ${data_dir}/segments ${data_dir}/tmp - cp $data_dir/tmp/feats.scp $data_dir - awk '{print $1" "$2}' $data_dir/segments > $data_dir/utt2spk - utils/utt2spk_to_spk2utt.pl $data_dir/utt2spk > $data_dir/spk2utt -fi - if [ $stage -le 3 ]; then steps/segmentation/prepare_targets_gmm.sh --stage $prepare_targets_stage \ --train-cmd "$train_cmd" --decode-cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/steps/segmentation/internal/merge_segment_targets_to_recording.py b/egs/wsj/s5/steps/segmentation/internal/merge_segment_targets_to_recording.py index f8d5008c3e9..8c53e5e8db9 100755 --- a/egs/wsj/s5/steps/segmentation/internal/merge_segment_targets_to_recording.py +++ b/egs/wsj/s5/steps/segmentation/internal/merge_segment_targets_to_recording.py @@ -165,6 +165,8 @@ def run(args): axis=0) utts.sort(key=lambda x: segments[x][1]) # sort on start time + end_frame_accounted = 0 + for i, utt in enumerate(utts): if utt not in segments or utt not in targets: num_utt_err += 1 @@ -208,45 +210,58 @@ def run(args): num_utt_err += 1 continue + # Fix end_frame and num_frames if the segment goes beyond + # the length of the recording. if end_frame > reco2num_frames[reco]: end_frame = reco2num_frames[reco] num_frames = end_frame - start_frame - if num_frames < 0: + # Fix "num_frames" and "end_frame" if "num_frames" is lower + # than the size of the targets matrix "mat" + num_frames = min(num_frames, mat.shape[0]) + end_frame = start_frame + num_frames + + if num_frames <= 0: logger.warning("For utterance {utt}, start-frame {start} " "is outside the recording" "".format(utt=utt, start=start_frame)) num_utt_err += 1 continue - prev_utt_end_frame = ( - int(segments[utts[i-1]][2] / args.frame_shift + 0.5) - if i > 0 else 0) - if start_frame < prev_utt_end_frame: - # Segment overlaps with the previous utterance + if end_frame < end_frame_accounted: + logger.warning("For utterance {utt}, end-frame {end} " + "is before the end of a previous segment. " + "i.e. this segment is completely within " + "another segment. Ignoring this segment." + "".format(utt=utt, end=end_frame)) + num_utt_err +=1 + continue + + if start_frame < end_frame_accounted: + # Segment overlaps with a previous utterance # Combine targets using a weighted interpolation using a # triangular window with a weight of 1 at the start/end of # overlap and 0 at the end/start of the segment - for n in range(0, prev_utt_end_frame - start_frame): - w = float(n) / float(prev_utt_end_frame - start_frame) + for n in range(0, end_frame_accounted - start_frame): + w = float(n) / float(end_frame_accounted - start_frame) reco_mat[n + start_frame, :] = ( reco_mat[n + start_frame, :] * (1.0 - w) + mat[n, :] * w) - num_frames = min(num_frames, mat.shape[0]) - end_frame = start_frame + num_frames - reco_mat[prev_utt_end_frame:end_frame, :] = ( - mat[(prev_utt_end_frame-start_frame): - (end_frame-start_frame), :]) + if end_frame > end_frame_accounted: + reco_mat[end_frame_accounted:end_frame, :] = ( + mat[(end_frame_accounted-start_frame): + (end_frame-start_frame), :]) else: # No overlap with the previous utterances. # So just add it to the output. 
- num_frames = min(num_frames, mat.shape[0]) - reco_mat[start_frame:(start_frame + num_frames), :] = ( + reco_mat[start_frame:end_frame, :] = ( mat[0:num_frames, :]) logger.debug("reco_mat shape = %s, mat shape = %s, " "start_frame = %d, end_frame = %d", reco_mat.shape, mat.shape, start_frame, end_frame) + + end_frame_accounted = end_frame num_utt += 1 if reco_mat.shape[0] > 0: diff --git a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh index f8557a70177..d2b03e0c72e 100755 --- a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh +++ b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh @@ -66,8 +66,10 @@ if [ $# -ne 6 ]; then Usage: $0 e.g.: $0 data/lang data/train data/train_whole exp/tri5 exp/tri4 exp/segmentation_1a - Note: Both and must have the recording-id - as speaker, and must contain feats.scp. + Note: is expected to have feats.scp and + expected to have segments file. We will get the features for by + using row ranges of /feats.scp. This script will + work on a copy of created to have the recording-id as the speaker-id. EOF exit 1 fi @@ -97,8 +99,7 @@ else extra_files="$extra_files $graph_dir/HCLG.fst $graph_dir/phones.txt" fi -for f in $in_data_dir/feats.scp $in_whole_data_dir/feats.scp \ - $in_data_dir/segments \ +for f in $in_whole_data_dir/feats.scp $in_data_dir/segments \ $lang/phones.txt $garbage_phones_list $silence_phones_list \ $ali_model_dir/final.mdl $model_dir/final.mdl $extra_files; do if [ ! -f $f ]; then @@ -125,8 +126,7 @@ if [ $stage -le 0 ]; then utils/data/modify_speaker_info_to_recording.sh \ $in_data_dir $dir/$data_id || exit 1 - steps/compute_cmvn_stats.sh $dir/$data_id || exit 1 - utils/validate_data_dir.sh $dir/$data_id || exit 1 + utils/validate_data_dir.sh --no-spk-sort --no-feats $dir/$data_id || exit 1 fi # Work with a temporary data directory with recording-id as the speaker labels. @@ -135,6 +135,13 @@ data_dir=$dir/${data_id} ############################################################################### # Get feats for the manual segments ############################################################################### +if [ $stage -le 1 ]; then + utils/data/subsegment_data_dir.sh $in_whole_data_dir ${data_dir}/segments ${data_dir}/tmp + cp $data_dir/tmp/feats.scp $data_dir + + steps/compute_cmvn_stats.sh $data_dir || exit 1 +fi + if [ $stage -le 2 ]; then utils/copy_data_dir.sh $in_whole_data_dir $dir/$whole_data_id From ad3fe8ff0394ab82a6a1dc666a65bab39079f2c3 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 12 Mar 2018 20:55:04 -0400 Subject: [PATCH 133/174] ts learning from post --- src/chain/chain-training.cc | 8 ++++---- src/chainbin/Makefile | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 98410d58e23..f104d446875 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -53,7 +53,7 @@ void ComputeKLObjfAndDeriv(const ChainTrainingOptions &opts, den_logprob_weighted = supervision_weight * denominator.Forward(); if (nnet_output_deriv) - ok = denominator.Backward(supervision_weight, + ok = denominator.Backward(-supervision_weight, nnet_output_deriv); } @@ -65,13 +65,13 @@ void ComputeKLObjfAndDeriv(const ChainTrainingOptions &opts, // shape). 
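// (Note on the sign conventions in this function: the denominator backward
//  pass is invoked with -supervision_weight, so 'nnet_output_deriv'
//  accumulates -weight * denominator_posteriors; the fixed numerator
//  posteriors are then added below with +weight, giving the usual
//  deriv = weight * (numerator_post - denominator_post).)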
xent_output_deriv->Resize(nnet_output.NumRows(), nnet_output.NumCols(), kSetZero, kStrideEqualNumCols); - xent_output_deriv->CopyFromMat(supervision.GetFullMatrix()); + supervision.CopyToMat(xent_output_deriv); xent_output_deriv->Scale(supervision_weight); if (nnet_output_deriv) nnet_output_deriv->AddMat(1.0, *xent_output_deriv); } else if (nnet_output_deriv) { CuMatrix numerator_post(nnet_output.NumRows(), nnet_output.NumCols()); - numerator_post.CopyFromMat(supervision.GetFullMatrix()); + supervision.CopyToMat(&numerator_post); nnet_output_deriv->AddMat(supervision_weight, numerator_post); } @@ -226,7 +226,7 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrixBase *xent_output_deriv, + CuMatrix *xent_output_deriv, const CuArray *sil_indices) { // num_posteriors is a matrix of size // (num_sequences * frames_per_sequence) x num_pdfs and is ordered in the diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile index 8a7a9bf2c0f..e721e35b24a 100644 --- a/src/chainbin/Makefile +++ b/src/chainbin/Makefile @@ -12,7 +12,8 @@ BINFILES = chain-est-phone-lm chain-get-supervision chain-make-den-fst \ nnet3-chain-acc-lda-stats nnet3-chain-train nnet3-chain-compute-prob \ nnet3-chain-combine nnet3-chain-normalize-egs \ nnet3-chain-split-and-get-egs chain-split-lattices \ - nnet3-chain-split-convert-and-get-egs + nnet3-chain-split-convert-and-get-egs \ + nnet3-chain-train-post nnet3-chain-compute-prob-post OBJFILES = From a511b0f45c29070b7c98f01ef7abd6112a08055e Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 12 Mar 2018 20:55:20 -0400 Subject: [PATCH 134/174] update aspire sad --- egs/aspire/s5/local/run_asr_segmentation.sh | 22 +++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/egs/aspire/s5/local/run_asr_segmentation.sh b/egs/aspire/s5/local/run_asr_segmentation.sh index 731b6721a78..bd6873836d7 100755 --- a/egs/aspire/s5/local/run_asr_segmentation.sh +++ b/egs/aspire/s5/local/run_asr_segmentation.sh @@ -41,14 +41,15 @@ train_stage=-10 test_stage=-10 num_data_reps=3 affix=_1a # For segmentation -test_affix=1a +test_affix=1a_silscale0.05 +nnet_affix=1a stage=-1 nj=80 reco_nj=40 # test options test_nj=30 -test_stage=1 +test_stage=0 . ./cmd.sh if [ -f ./path.sh ]; then . 
./path.sh; fi @@ -185,8 +186,9 @@ if [ $stage -le 6 ]; then rm -r ${rvb_targets_dirs[@]} fi -sad_nnet_dir=exp/segmentation${affix}/tdnn_stats_asr_sad_1a -#sad_nnet_dir=exp/segmentation${affix}/tdnn_lstm_asr_sad_1a +sad_nnet_dir=exp/segmentation${affix}/tdnn_stats_asr_sad_$nnet_affix +sad_opts="--extra-left-context 79 --extra-right-context 21 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0 --acwt 0.3" +#sad_nnet_dir=exp/segmentation${affix}/tdnn_lstm_asr_sad_$nnet_affix #sad_opts="--extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0 --acwt 0.3" if [ $stage -le 7 ]; then @@ -194,13 +196,13 @@ if [ $stage -le 7 ]; then local/segmentation/tuning/train_stats_asr_sad_1a.sh \ --stage $nstage --train-stage $train_stage \ --targets-dir ${rvb_targets_dir} \ - --data-dir ${rvb_data_dir} --affix "1a" || exit 1 + --data-dir ${rvb_data_dir} --affix "$nnet_affix" || exit 1 # # Train a TDNN+LSTM network for SAD # local/segmentation/tuning/train_lstm_asr_sad_1a.sh \ # --stage $nstage --train-stage $train_stage \ # --targets-dir ${rvb_targets_dir} \ - # --data-dir ${rvb_data_dir} --affix "1a" || exit 1 + # --data-dir ${rvb_data_dir} --affix "$nnet_affix" || exit 1 fi if [ ! -f data/dev_aspire/wav.scp ]; then @@ -220,15 +222,15 @@ if [ $stage -le 9 ]; then # Use left and right context options that were used when training # the chain nnet # Increase sil-scale to predict more silence - local/nnet3/prep_test_aspire_segmentation.sh --stage $test_stage \ - --decode-num-jobs $test_nj --affix "${test_affix}" \ + local/nnet3/segment_and_decode.sh --stage $test_stage \ + --decode-num-jobs $test_nj --sad-affix "${test_affix}" --affix "${test_affix}" \ --sad-opts "$sad_opts" \ - --sad-graph-opts "--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0" --sad-priors-opts "--sil-scale=0.1" \ + --sad-graph-opts "--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0" --sad-priors-opts "--sil-scale=0.05" \ --acwt 1.0 --post-decode-acwt 10.0 \ --extra-left-context 50 \ --extra-right-context 0 \ --extra-left-context-initial 0 --extra-right-context-final 0 \ --sub-speaker-frames 6000 --max-count 75 \ --decode-opts "--min-active 1000" \ - dev_aspire $sad_nnet_dir $sad_nnet_dir data/lang $chain_dir/graph_pp $chain_dir + dev_aspire_ldc $sad_nnet_dir $sad_nnet_dir data/lang $chain_dir/graph_pp $chain_dir fi From be6b95a0312590fd61cfdcbd9b9e503db1e0bb75 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 13 Mar 2018 18:52:39 -0400 Subject: [PATCH 135/174] Support multiple smbr-factors for outputs --- egs/wsj/s5/steps/nnet3/chain/train.py | 71 ++++++++++++------ src/nnet3/nnet-chain-training.cc | 101 +++++++++++--------------- src/nnet3/nnet-utils.cc | 20 +++++ src/nnet3/nnet-utils.h | 3 + 4 files changed, 113 insertions(+), 82 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index e8a72641277..c8240b444a8 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -573,18 +573,22 @@ def train(args, run_opts): xent_regularize = args.xent_regularize l2_regularize = args.l2_regularize - objective_opts = ("--objective-scales=" + args.objective_scales - if args.objective_scales is not None else "") - smbr_factor = 0.0 + objective_opts = "" + + use_smbr_objective = False if args.smbr_factor_schedule is not None: - smbr_factor = common_train_lib.get_schedule_string( + 
smbr_factors = common_train_lib.get_schedule_string( args.smbr_factor_schedule, float(num_archives_processed) / num_archives_to_process) - objective_opts += " --smbr-factor={0}".format(smbr_factor) + objective_opts += " --smbr-factors='{0}'".format(smbr_factors) + for factor in smbr_factors.split(): + parts = factor.split(":") + if parts[1] > 0.0: + use_smbr_objective = True + break - if smbr_factor > 0.0: - use_smbr=True + if use_smbr_objective: xent_regularize = (args.smbr_xent_regularize if args.smbr_xent_regularize is not None else args.xent_regularize) @@ -598,18 +602,18 @@ def train(args, run_opts): objective_opts += " " + args.smbr_extra_opts if args.mmi_factor_schedule is not None: - mmi_factor = common_train_lib.get_schedule_string( + mmi_factors = common_train_lib.get_schedule_string( args.mmi_factor_schedule, float(num_archives_processed) / num_archives_to_process) - objective_opts += " --mmi-factor={0}".format(mmi_factor) + objective_opts += " --mmi-factors='{0}'".format(mmi_factors) if args.ml_factor_schedule is not None: - ml_factor = common_train_lib.get_schedule_string( + ml_factors = common_train_lib.get_schedule_string( args.ml_factor_schedule, float(num_archives_processed) / num_archives_to_process) - objective_opts += " --ml-factor={0}".format(ml_factor) + objective_opts += " --ml-factors='{0}'".format(ml_factors) objective_opts += " --norm-regularize={0}".format( "true" if args.norm_regularize else "false") @@ -649,7 +653,7 @@ def train(args, run_opts): l2_regularize=l2_regularize, xent_regularize=xent_regularize, leaky_hmm_coefficient=(args.smbr_leaky_hmm_coefficient - if smbr_factor > 0.0 and args.smbr_leaky_hmm_coefficient is not None + if use_smbr_objective and args.smbr_leaky_hmm_coefficient is not None else args.leaky_hmm_coefficient), momentum=args.momentum, max_param_change=args.max_param_change, @@ -688,30 +692,49 @@ def train(args, run_opts): l2_regularize = args.l2_regularize objective_opts = ("--objective-scales=" + args.objective_scales if args.objective_scales is not None else "") - smbr_factor = 0.0 - if args.smbr_factor_schedule is not None: - smbr_factor = common_train_lib.get_schedule_string( - args.smbr_factor_schedule, 1.0) - objective_opts += " --smbr-factor={0}".format(smbr_factor) - - if smbr_factor > 0.0: - use_smbr=True + use_smbr_objective = False + if args.smbr_factor_schedule is not None: + smbr_factors = common_train_lib.get_schedule_string( + args.smbr_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --smbr-factors='{0}'".format(smbr_factors) + for factor in smbr_factors.split(): + parts = factor.split(":") + if parts[1] > 0.0: + use_smbr_objective = True + break + + if use_smbr_objective: xent_regularize = (args.smbr_xent_regularize if args.smbr_xent_regularize is not None else args.xent_regularize) l2_regularize = (args.smbr_l2_regularize if args.smbr_l2_regularize is not None else args.l2_regularize) - objective_opts = "--use-smbr-objective" + objective_opts += " --use-smbr-objective" if silence_pdfs is not None: objective_opts += " --silence-pdfs=" + silence_pdfs + if args.smbr_extra_opts is not None: + objective_opts += " " + args.smbr_extra_opts if args.mmi_factor_schedule is not None: - mmi_factor = common_train_lib.get_schedule_string( - args.mmi_factor_schedule, 1.0) + mmi_factors = common_train_lib.get_schedule_string( + args.mmi_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --mmi-factors='{0}'".format(mmi_factors) + + if 
args.ml_factor_schedule is not None: + ml_factors = common_train_lib.get_schedule_string( + args.ml_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --ml-factors='{0}'".format(ml_factors) - objective_opts += " --mmi-factor={0}".format(mmi_factor) + objective_opts += " --norm-regularize={0}".format( + "true" if args.norm_regularize else "false") if args.do_final_combination: logger.info("Doing final combination to produce final.mdl") diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index e92151830c0..e205b842747 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -24,6 +24,7 @@ namespace kaldi { namespace nnet3 { + NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, const fst::StdVectorFst &den_fst, Nnet *nnet): @@ -99,25 +100,17 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, sil_indices_.CopyFromVec(indices); } - if (!opts.nnet_config.objective_scales_str.empty()) { - std::vector objectives_for_outputs; - SplitStringToVector(opts.nnet_config.objective_scales_str, ",", false, - &objectives_for_outputs); - std::vector::const_iterator it = objectives_for_outputs.begin(); - for (; it != objectives_for_outputs.end(); ++it) { - std::vector this_output_objective; - SplitStringToVector(*it, ":", false, - &this_output_objective); - - BaseFloat scale; - ConvertStringToReal(this_output_objective[1], &scale); - objective_scales_.insert( - std::make_pair(this_output_objective[0], scale)); - } - } + if (!opts.chain_config.smbr_factors_str.empty()) + ParseObjectiveScales(opts.chain_config.smbr_factors_str, + &smbr_factors_); + if (!opts.chain_config.mmi_factors_str.empty()) + ParseObjectiveScales(opts.chain_config.mmi_factors_str, + &mmi_factors_); + if (!opts.chain_config.ml_factors_str.empty()) + ParseObjectiveScales(opts.chain_config.ml_factors_str, + &ml_factors_); } - void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { bool need_model_derivative = true; const NnetTrainerOptions &nnet_config = opts_.nnet_config; @@ -277,14 +270,32 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, nnet_output.NumCols(), kUndefined); - bool use_xent = (opts_.chain_config.xent_regularize != 0.0); + chain::ChainTrainingOptions chain_config(opts_.chain_config); + + { + auto it = smbr_factors_.find(sup.name); + if (it != smbr_factors_.end()) + chain_config.smbr_factor = it->second; + } + { + auto it = mmi_factors_.find(sup.name); + if (it != mmi_factors_.end()) + chain_config.mmi_factor = it->second; + } + { + auto it = ml_factors_.find(sup.name); + if (it != ml_factors_.end()) + chain_config.ml_factor = it->second; + } + + bool use_xent = (chain_config.xent_regularize != 0.0); std::string xent_name = sup.name + "-xent"; // typically "output-xent". CuMatrix xent_deriv; BaseFloat tot_objf, tot_mmi_objf, tot_l2_term, tot_weight; - if (opts_.chain_config.use_smbr_objective) { - ComputeChainSmbrObjfAndDeriv(opts_.chain_config, den_graph_, + if (chain_config.use_smbr_objective) { + ComputeChainSmbrObjfAndDeriv(chain_config, den_graph_, sup.supervision, nnet_output, &tot_objf, &tot_mmi_objf, &tot_l2_term, &tot_weight, @@ -292,28 +303,13 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, (use_xent ? &xent_deriv : NULL), sil_indices_.Dim() ? 
&sil_indices_ : NULL); } else { - ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, + ComputeChainObjfAndDeriv(chain_config, den_graph_, sup.supervision, nnet_output, &tot_objf, &tot_l2_term, &tot_weight, &nnet_output_deriv, (use_xent ? &xent_deriv : NULL)); } - BaseFloat objf_scale = 1.0; - { - unordered_map::iterator it = - objective_scales_.find(sup.name); - - if (it != objective_scales_.end()) { - objf_scale = it->second; - tot_objf *= it->second; - tot_l2_term *= it->second; - tot_mmi_objf *= it->second; - tot_weight *= it->second; - nnet_output_deriv.Scale(it->second); - } - } - if (use_xent) { // this block computes the cross-entropy objective. const CuMatrixBase &xent_output = computer->GetOutput( @@ -321,17 +317,7 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, // at this point, xent_deriv is posteriors derived from the numerator // computation. note, xent_objf has a factor of '.supervision.weight' BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans); - - { - unordered_map::iterator it = - objective_scales_.find(xent_name); - - if (it != objective_scales_.end()) { - xent_objf *= it->second; - xent_deriv.Scale(it->second); - } - } - + objf_info_[xent_name + suffix].UpdateStats(xent_name + suffix, opts_.nnet_config.print_interval, num_minibatches_processed_, @@ -347,24 +333,23 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, std::vector objective_values; objective_values.push_back(tot_l2_term); - if (opts_.chain_config.use_smbr_objective) + if (chain_config.use_smbr_objective) objective_values.push_back(tot_mmi_objf); { - unordered_map::iterator it + unordered_map::iterator it = objf_info_.find(sup.name + suffix); if (it == objf_info_.end()) { - BaseFloat this_objf_scale = objf_scale; - std::vector aux_objf_scales(1, objf_scale); // l2_term - if (opts_.chain_config.use_smbr_objective) { - this_objf_scale *= opts_.chain_config.smbr_factor; + BaseFloat this_objf_scale = 1.0; + std::vector aux_objf_scales(1, 1.0); // l2_term + if (chain_config.use_smbr_objective) { + this_objf_scale *= chain_config.smbr_factor; aux_objf_scales.push_back( - objf_scale * - (opts_.chain_config.mmi_factor + opts_.chain_config.ml_factor)); + (chain_config.mmi_factor + chain_config.ml_factor)); } - ObjectiveFunctionInfo totals(objf_scale, aux_objf_scales); + ObjectiveFunctionInfo totals(this_objf_scale, aux_objf_scales); it = objf_info_.insert(it, std::make_pair(sup.name + suffix, totals)); } @@ -384,7 +369,7 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, computer->AcceptInput(sup.name, &nnet_output_deriv); if (use_xent) { - xent_deriv.Scale(opts_.chain_config.xent_regularize); + xent_deriv.Scale(chain_config.xent_regularize); if (opts_.accumulate_avg_deriv && objf_info_[xent_name + suffix].deriv_sum.Dim() == 0) objf_info_[xent_name + suffix].deriv_sum.Resize(nnet_output.NumCols()); @@ -404,7 +389,7 @@ bool NnetChainTrainer::PrintTotalStats() const { for (; iter != end; ++iter) { const std::string &name = iter->first; const ObjectiveFunctionInfo &info = iter->second; - + ans = info.PrintTotalStats(name) || ans; } PrintMaxChangeStats(); diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index fd2229cace8..4bea2d50cee 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -1986,6 +1986,26 @@ void ApplyL2Regularization(const Nnet &nnet, } } +void ParseObjectiveScales( + const std::string &objective_scales_str, + std::unordered_map *objective_scales) { + objective_scales->clear(); + + std::vector 
objectives_for_outputs; + SplitStringToVector(objective_scales_str, ", ", false, + &objectives_for_outputs); + std::vector::const_iterator it = objectives_for_outputs.begin(); + for (; it != objectives_for_outputs.end(); ++it) { + std::vector this_output_objective; + SplitStringToVector(*it, ":", false, + &this_output_objective); + + BaseFloat scale; + ConvertStringToReal(this_output_objective[1], &scale); + objective_scales->insert( + std::make_pair(this_output_objective[0], scale)); + } +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index efa36e1f64c..10cc474eba9 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -485,6 +485,9 @@ void ConstrainOrthonormal(Nnet *nnet); int32 GetNumNvalues(const std::vector &io_vec, bool exhaustive); +void ParseObjectiveScales( + const std::string &objective_scales_str, + std::unordered_map *objective_scales); } // namespace nnet3 } // namespace kaldi From e5db432b02ed303f449529b3e4a7a9c96ac4c353 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 20 Mar 2018 17:38:49 -0400 Subject: [PATCH 136/174] Re-organize sequence kl training --- src/chain/chain-supervision.cc | 60 ++++++++++- src/chain/chain-supervision.h | 5 + src/chain/chain-training.cc | 29 +++--- src/chain/chain-training.h | 19 ++-- src/chainbin/Makefile | 4 +- src/chainbin/nnet3-chain-acc-lda-stats.cc | 37 ++++--- src/chainbin/nnet3-chain-get-egs-post.cc | 108 +++++++++++++++----- src/chainbin/nnet3-chain-get-egs.cc | 2 +- src/latbin/lattice-determinize-pruned.cc | 3 +- src/matrix/sparse-matrix.cc | 17 ++-- src/nnet3/nnet-chain-diagnostics.cc | 49 +++++---- src/nnet3/nnet-chain-diagnostics.h | 14 +-- src/nnet3/nnet-chain-example.cc | 2 +- src/nnet3/nnet-chain-training.cc | 115 ++++++++++++++++++---- src/nnet3/nnet-chain-training.h | 10 +- src/nnet3/nnet-diagnostics.cc | 15 ++- src/nnet3/nnet-example-utils.cc | 65 +++++++++--- src/nnet3/nnet-example.cc | 33 ++++++- src/nnet3/nnet-example.h | 9 ++ 19 files changed, 455 insertions(+), 141 deletions(-) diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index a6222bc6593..192aef992f0 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -748,6 +748,10 @@ void Supervision::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteBasicType(os, binary, e2e); if (!e2e) { + if (numerator_post_targets.NumRows() > 0) { + WriteToken(os, binary, ""); + numerator_post_targets.Write(os, binary); + } if (binary == false) { // In text mode, write the FST without any compactification. 
WriteFstKaldi(os, binary, fst); @@ -788,6 +792,7 @@ void Supervision::Swap(Supervision *other) { std::swap(fst, other->fst); std::swap(e2e, other->e2e); std::swap(e2e_fsts, other->e2e_fsts); + std::swap(numerator_post_targets, other->numerator_post_targets); } void Supervision::Read(std::istream &is, bool binary) { @@ -807,6 +812,15 @@ void Supervision::Read(std::istream &is, bool binary) { e2e = false; } if (!e2e) { + if (PeekToken(is, binary) == 'N') { + ExpectToken(is, binary, ""); + numerator_post_targets.Read(is, binary); + + if (PeekToken(is, binary) == '/') { + ExpectToken(is, binary, ""); + return; + } + } if (!binary) { ReadFstKaldi(is, binary, &fst); } else { @@ -874,11 +888,19 @@ int32 ComputeFstStateTimes(const fst::StdVectorFst &fst, return total_length; } +Supervision::Supervision(int32 dim, const Posterior &labels): + weight(1.0), num_sequences(1), frames_per_sequence(labels.size()), + label_dim(dim), e2e(false) { + SparseMatrix sparse_feats(dim, labels); + numerator_post_targets = sparse_feats; +} + Supervision::Supervision(const Supervision &other): weight(other.weight), num_sequences(other.num_sequences), frames_per_sequence(other.frames_per_sequence), label_dim(other.label_dim), fst(other.fst), - e2e(other.e2e), e2e_fsts(other.e2e_fsts) { } + e2e(other.e2e), e2e_fsts(other.e2e_fsts), + numerator_post_targets(other.numerator_post_targets) { } // This static function is called by AppendSupervision if the supervisions @@ -901,17 +923,46 @@ void AppendSupervisionE2e(const std::vector &input, } } +void AppendSupervisionPost(const std::vector &input, + std::vector *output_supervision) { + KALDI_ASSERT(!input.empty()); + int32 label_dim = input[0]->label_dim, + num_inputs = input.size(); + KALDI_ASSERT(num_inputs > 1); + KALDI_ASSERT(input[0]->numerator_post_targets.NumRows() > 0); + + KALDI_ASSERT(output_supervision->size() == 1); // otherwise not supported + KALDI_ASSERT((*output_supervision)[0].num_sequences == num_inputs); + + std::vector output_targets(num_inputs); + output_targets[0] = &(input[0]->numerator_post_targets); + + for (int32 i = 1; i < num_inputs; i++) { + output_targets[i] = &(input[i]->numerator_post_targets); + KALDI_ASSERT(output_targets[i]->NumRows() > 0); + KALDI_ASSERT(output_targets[i]->NumCols() == label_dim); + KALDI_ASSERT(input[i]->frames_per_sequence == + (*output_supervision)[0].frames_per_sequence); + } + + AppendGeneralMatrixRows( + output_targets, &((*output_supervision)[0].numerator_post_targets), + true); // sort by t +} + void AppendSupervision(const std::vector &input, bool compactify, std::vector *output_supervision) { KALDI_ASSERT(!input.empty()); int32 label_dim = input[0]->label_dim, num_inputs = input.size(); + KALDI_ASSERT(label_dim > 0); if (num_inputs == 1) { output_supervision->resize(1); (*output_supervision)[0] = *(input[0]); return; } + if (input[0]->e2e) { AppendSupervisionE2e(input, compactify, output_supervision); return; @@ -940,15 +991,22 @@ void AppendSupervision(const std::vector &input, output_was_merged.push_back(false); } } + KALDI_ASSERT(output_was_merged.size() == output_supervision->size()); for (size_t i = 0; i < output_supervision->size(); i++) { if (output_was_merged[i]) { fst::StdVectorFst &out_fst = (*output_supervision)[i].fst; // The process of concatenation will have introduced epsilons. 
fst::RmEpsilon(&out_fst); + if (input[0]->numerator_post_targets.NumRows() > 0 && out_fst.Start() < 0) + return; SortBreadthFirstSearch(&out_fst); } } + + if (input[0]->numerator_post_targets.NumRows() > 0) { + AppendSupervisionPost(input, output_supervision); + } } // This static function is called by AddWeightToSupervisionFst if the supervision diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index d273e029065..09b06012d09 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -30,6 +30,7 @@ #include "lat/kaldi-lattice.h" #include "fstext/deterministic-fst.h" #include "hmm/transition-model.h" +#include "hmm/posterior.h" namespace kaldi { namespace chain { @@ -275,9 +276,13 @@ struct Supervision { bool e2e; // end to end std::vector e2e_fsts; + GeneralMatrix numerator_post_targets; + Supervision(): weight(1.0), num_sequences(1), frames_per_sequence(-1), label_dim(-1), e2e(false) { } + Supervision(int32 dim, const Posterior &labels); + Supervision(const Supervision &other); void Swap(Supervision *other); diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 2a63708ee1f..69c16c8be69 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -143,15 +143,16 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, void ComputeKLObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, - const GeneralMatrix &supervision, BaseFloat supervision_weight, + const Supervision &supervision, const CuMatrixBase &nnet_output, - int32 num_sequences, int32 frames_per_sequence, BaseFloat *objf, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, CuMatrix *xent_output_deriv) { - KALDI_ASSERT(nnet_output.NumRows() == num_sequences * frames_per_sequence); + KALDI_ASSERT(supervision.numerator_post_targets.NumRows() > 0); + KALDI_ASSERT(nnet_output.NumRows() == supervision.num_sequences * supervision.frames_per_sequence); + KALDI_ASSERT(supervision.numerator_post_targets.NumRows() == nnet_output.NumRows()); BaseFloat den_logprob_weighted; bool ok = true; @@ -162,12 +163,12 @@ void ComputeKLObjfAndDeriv(const ChainTrainingOptions &opts, // memory use, as we can set 'xent_deriv' to nonempty after // we've freed the memory in this object. DenominatorComputation denominator(opts, den_graph, - num_sequences, + supervision.num_sequences, nnet_output); - den_logprob_weighted = supervision_weight * denominator.Forward(); + den_logprob_weighted = supervision.weight * denominator.Forward(); if (nnet_output_deriv) - ok = denominator.Backward(-supervision_weight, + ok = denominator.Backward(-supervision.weight, nnet_output_deriv); } @@ -179,18 +180,18 @@ void ComputeKLObjfAndDeriv(const ChainTrainingOptions &opts, // shape). 
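// (When 'xent_output_deriv' is requested it also serves as scratch space for
//  the numerator targets: the posterior targets are copied into it, scaled by
//  the supervision weight, and then added into 'nnet_output_deriv', so the
//  same matrix feeds both the cross-entropy regularizer and the main
//  derivative.)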
xent_output_deriv->Resize(nnet_output.NumRows(), nnet_output.NumCols(), kSetZero, kStrideEqualNumCols); - supervision.CopyToMat(xent_output_deriv); - xent_output_deriv->Scale(supervision_weight); + supervision.numerator_post_targets.CopyToMat(xent_output_deriv); + xent_output_deriv->Scale(supervision.weight); if (nnet_output_deriv) nnet_output_deriv->AddMat(1.0, *xent_output_deriv); } else if (nnet_output_deriv) { CuMatrix numerator_post(nnet_output.NumRows(), nnet_output.NumCols()); - supervision.CopyToMat(&numerator_post); - nnet_output_deriv->AddMat(supervision_weight, numerator_post); + supervision.numerator_post_targets.CopyToMat(&numerator_post); + nnet_output_deriv->AddMat(supervision.weight, numerator_post); } *objf = -den_logprob_weighted; - *weight = supervision_weight * num_sequences * frames_per_sequence; + *weight = supervision.weight * supervision.num_sequences * supervision.frames_per_sequence; if (!((*objf) - (*objf) == 0) || !ok) { // inf or NaN detected, or denominator computation returned false. if (nnet_output_deriv) @@ -215,9 +216,9 @@ void ComputeKLObjfAndDeriv(const ChainTrainingOptions &opts, CuVector row_products(tot_frames); row_products.AddDiagMat2(1.0, *nnet_output_deriv, kNoTrans, 0.0); Vector row_products_cpu(row_products); - Vector row_products_per_frame(frames_per_sequence); + Vector row_products_per_frame(supervision.frames_per_sequence); for (int32 i = 0; i < tot_frames; i++) - row_products_per_frame(i / num_sequences) += row_products_cpu(i); + row_products_per_frame(i / supervision.num_sequences) += row_products_cpu(i); KALDI_LOG << "Derivs per frame are " << row_products_per_frame; } @@ -225,7 +226,7 @@ void ComputeKLObjfAndDeriv(const ChainTrainingOptions &opts, *l2_term = 0.0; } else { // compute the l2 penalty term and its derivative - BaseFloat scale = supervision_weight * opts.l2_regularize; + BaseFloat scale = supervision.weight * opts.l2_regularize; *l2_term = -0.5 * scale * TraceMatMat(nnet_output, nnet_output, kTrans); if (nnet_output_deriv) nnet_output_deriv->AddMat(-1.0 * scale, nnet_output); diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 8c2946f41fa..4c0f3849bd4 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -210,16 +210,15 @@ void ComputeChainSmbrObjfAndDeriv( This function uses supervision as numerator and does denominator computation. It can be uses, where numerator is fixed e.g. TS learning. 
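   Put differently: the numerator statistics are fixed posterior targets
   (e.g. posteriors produced by a teacher model), so no numerator
   forward-backward is needed; only the denominator forward-backward is run.
   The value returned in 'objf' is the negated, weighted denominator
   log-probability, and the derivative written to 'nnet_output_deriv' is
   weight * (numerator_post - denominator_post).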
*/ -void ComputeObjfAndDeriv2(const ChainTrainingOptions &opts, - const DenominatorGraph &den_graph, - const GeneralMatrix &supervision, - const CuMatrixBase &nnet_output, - int32 num_sequences, int32 frames_per_sequence, - BaseFloat *objf, - BaseFloat *l2_term, - BaseFloat *weight, - CuMatrixBase *nnet_output_deriv, - CuMatrixBase *xent_output_deriv = NULL); +void ComputeKLObjfAndDeriv(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const Supervision &supervision, + const CuMatrixBase &nnet_output, + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrix *xent_output_deriv = NULL); } // namespace chain } // namespace kaldi diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile index 24807eacb09..f3d125e2aa0 100644 --- a/src/chainbin/Makefile +++ b/src/chainbin/Makefile @@ -7,14 +7,14 @@ LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) BINFILES = chain-est-phone-lm chain-get-supervision chain-make-den-fst \ - nnet3-chain-get-egs nnet3-chain-get-egs-post nnet3-chain-copy-egs nnet3-chain-merge-egs \ + nnet3-chain-get-egs nnet3-chain-copy-egs nnet3-chain-merge-egs \ nnet3-chain-shuffle-egs nnet3-chain-subset-egs \ nnet3-chain-acc-lda-stats nnet3-chain-train nnet3-chain-compute-prob \ nnet3-chain-combine nnet3-chain-normalize-egs \ nnet3-chain-e2e-get-egs \ nnet3-chain-split-and-get-egs chain-split-lattices \ nnet3-chain-split-convert-and-get-egs \ - nnet3-chain-train-post nnet3-chain-compute-prob-post + nnet3-chain-get-egs-post OBJFILES = diff --git a/src/chainbin/nnet3-chain-acc-lda-stats.cc b/src/chainbin/nnet3-chain-acc-lda-stats.cc index b195f5ba1fb..4cc41ba76ff 100644 --- a/src/chainbin/nnet3-chain-acc-lda-stats.cc +++ b/src/chainbin/nnet3-chain-acc-lda-stats.cc @@ -87,20 +87,31 @@ class NnetChainLdaStatsAccumulator { KALDI_ASSERT(num_frames == nnet_output.NumRows()); const fst::StdVectorFst &fst = supervision.fst; - Lattice lat; - // convert the FST to a lattice, putting all the weight on - // the graph weight. This is to save us having to implement the - // forward-backward on FSTs. - ConvertFstToLattice(fst, &lat); Posterior post; - LatticeForwardBackward(lat, &post); - KALDI_ASSERT(post.size() == static_cast(num_frames)); - - // Subtract one, to convert the (pdf-id + 1) which appears in the - // supervision FST, to a pdf-id. - for (size_t i = 0; i < post.size(); i++) - for (size_t j = 0; j < post[i].size(); j++) - post[i][j].first--; + if (supervision.numerator_post_targets.NumRows() > 0) { + const SparseMatrix &labels = supervision.numerator_post_targets.GetSparseMatrix(); + post.resize(labels.NumRows()); + for (size_t i = 0; i < labels.NumRows(); i++) { + post[i].resize(labels.Row(i).NumElements()); + for (size_t j = 0; j < labels.Row(i).NumElements(); j++) { + post[i][j] = labels.Row(i).GetElement(j); + } + } + } else { + Lattice lat; + // convert the FST to a lattice, putting all the weight on + // the graph weight. This is to save us having to implement the + // forward-backward on FSTs. + ConvertFstToLattice(fst, &lat); + LatticeForwardBackward(lat, &post); + KALDI_ASSERT(post.size() == static_cast(num_frames)); + + // Subtract one, to convert the (pdf-id + 1) which appears in the + // supervision FST, to a pdf-id. 
+ for (size_t i = 0; i < post.size(); i++) + for (size_t j = 0; j < post[i].size(); j++) + post[i][j].first--; + } if (lda_stats_.Dim() == 0) lda_stats_.Init(num_pdfs, diff --git a/src/chainbin/nnet3-chain-get-egs-post.cc b/src/chainbin/nnet3-chain-get-egs-post.cc index 13dee599764..1cfe7d1cf6a 100644 --- a/src/chainbin/nnet3-chain-get-egs-post.cc +++ b/src/chainbin/nnet3-chain-get-egs-post.cc @@ -32,8 +32,9 @@ namespace kaldi { namespace nnet3 { -/** This function converts lattice to fst with weight equal to - sum of acoustic and language score. +/** This function converts lattice to FSA with weight equal to + sum of acoustic and language score, and pdf_id + 1 as labels. + This assumes that the acoustic and language scores are scaled appropriately. */ void ConvertLatticeToPdfLabels( const TransitionModel &tmodel, @@ -59,12 +60,15 @@ void ConvertLatticeToPdfLabels( for (fst::ArcIterator iter(ifst, s); !iter.Done(); iter.Next()) { - ArcIn arc = iter.Value(); + const ArcIn &arc = iter.Value(); KALDI_PARANOID_ASSERT(arc.weight != LatticeWeight::Zero()); ArcOut oarc; ConvertLatticeWeight(arc.weight, &oarc.weight); - oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; - oarc.olabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; + if (arc.ilabel == 0) + oarc.ilabel = 0; // epsilon arc + else + oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; // pdf + 1 + oarc.olabel = oarc.ilabel; oarc.nextstate = arc.nextstate; ofst->AddArc(s, oarc); } @@ -84,11 +88,13 @@ static bool ProcessFile(const GeneralMatrix &feats, int32 ivector_period, const Posterior &pdf_post, BaseFloat min_post, + const VectorBase *deriv_weights, + int32 supervision_length_tolerance, const std::string &utt_id, bool compress, int32 num_pdfs, UtteranceSplitter *utt_splitter, - NnetExampleWriter *example_writer) { + NnetChainExampleWriter *example_writer) { //KALDI_ASSERT(supervision.num_sequences == 1); int32 num_input_frames = feats.NumRows(); int32 num_output_frames = pdf_post.size(); @@ -109,6 +115,14 @@ static bool ProcessFile(const GeneralMatrix &feats, int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; + if (deriv_weights && (std::abs(deriv_weights->Dim() - num_output_frames) + > supervision_length_tolerance)) { + KALDI_WARN << "For utterance " << utt_id + << ", mismatch between deriv-weights dim and num-output-frames" + << "; " << deriv_weights->Dim() << " vs " << num_output_frames; + return false; + } + for (size_t c = 0; c < chunks.size(); c++) { ChunkTimeInfo &chunk = chunks[c]; @@ -120,9 +134,9 @@ static bool ProcessFile(const GeneralMatrix &feats, ExtractRowRangeWithPadding(feats, start_frame, tot_input_frames, &input_frames); - NnetExample eg; + NnetChainExample eg; // call the regular input "input". - eg.io.push_back(NnetIo("input", -chunk.left_context, input_frames)); + eg.inputs.push_back(NnetIo("input", -chunk.left_context, input_frames)); if (ivector_feats != NULL) { // if applicable, add the iVector feature. 
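// A minimal sketch of how per-frame pdf posteriors become posterior-target
// supervision in this binary; the numbers are illustrative and the
// constructor is the one added to chain-supervision.h in this series:
//
//   Posterior post(2);                             // 2 output frames
//   post[0].push_back(std::make_pair(12, 0.7));    // 0-based pdf-id, posterior
//   post[0].push_back(std::make_pair(45, 0.3));
//   post[1].push_back(std::make_pair(12, 1.0));
//   chain::Supervision sup(num_pdfs, post);
//   // sup.num_sequences == 1, sup.frames_per_sequence == 2, and
//   // sup.numerator_post_targets is a 2 x num_pdfs sparse matrix.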
@@ -136,7 +150,7 @@ static bool ProcessFile(const GeneralMatrix &feats, ivector_frame_subsampled = ivector_feats->NumRows() - 1; Matrix ivector(1, ivector_feats->NumCols()); ivector.Row(0).CopyFromVec(ivector_feats->Row(ivector_frame_subsampled)); - eg.io.push_back(NnetIo("ivector", 0, ivector)); + eg.inputs.push_back(NnetIo("ivector", 0, ivector)); } // Note: chunk.first_frame and chunk.num_frames will both be @@ -152,22 +166,34 @@ static bool ProcessFile(const GeneralMatrix &feats, if (t < pdf_post.size()) { for (int32 j = 0; j < pdf_post[t].size(); j++) { BaseFloat post = pdf_post[t][j].second; + KALDI_ASSERT(pdf_post[t][j].first > 0); if (post > min_post) { labels[i].push_back(std::make_pair( - pdf_post[t][j].first - 1, post)); + pdf_post[t][j].first - 1, post)); // Convert from 1-index to 0-index } } } - for (std::vector >::iterator - iter = labels[i].begin(); iter != labels[i].end(); ++iter) - iter->second *= chunk.output_weights[i]; } SubVector output_weights( &(chunk.output_weights[0]), static_cast(chunk.output_weights.size())); + KALDI_ASSERT(output_weights.Dim() == num_frames_subsampled); - eg.io.push_back(NnetIo("output", num_pdfs, 0, labels)); + chain::Supervision supervision(num_pdfs, labels); + if (!deriv_weights) { + eg.outputs.push_back(NnetChainSupervision("output", supervision, output_weights, + 0, frame_subsampling_factor)); + } else { + Vector this_deriv_weights(num_frames_subsampled); + for (int32 i = 0; i < num_frames_subsampled; i++) { + int32 t = i + start_frame_subsampled; + if (t < deriv_weights->Dim()) + this_deriv_weights(i) = (*deriv_weights)(t); + } + eg.outputs.push_back(NnetChainSupervision("output", supervision, this_deriv_weights, + 0, frame_subsampling_factor)); + } if (compress) eg.Compress(); @@ -215,7 +241,8 @@ int main(int argc, char *argv[]) { "ark:lat.1.ark ark:cegs.1.ark"; bool compress = true; - int32 length_tolerance = 100, online_ivector_period = 1; + int32 length_tolerance = 100, online_ivector_period = 1, + supervision_length_tolerance = 1; ExampleGenerationConfig eg_config; // controls num-frames, // left/right-context, etc. 
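// (On the --lm-scale / --acoustic-scale options registered in the next hunk:
//  the lattice graph/LM scores are scaled by lm_scale and the acoustic scores
//  by acoustic_scale before the lattice is converted to pdf labels, while the
//  normalization FST is scaled by 1 - lm_scale; as the option help below
//  says, this interpolates between the lattice LM scores and the
//  normalization (denominator phone-LM) scores when building the numerator
//  posteriors.)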
@@ -223,7 +250,7 @@ int main(int argc, char *argv[]) { int32 srand_seed = 0; std::string online_ivector_rspecifier, deriv_weights_rspecifier; - BaseFloat min_post = 1e-8, normalization_scale = 0.5; + BaseFloat min_post = 1e-8, lm_scale = 0.5, acoustic_scale = 1.0; ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs with input features " @@ -242,10 +269,15 @@ int main(int argc, char *argv[]) { po.Register("srand", &srand_seed, "Seed for random number generator "); po.Register("length-tolerance", &length_tolerance, "Tolerance for " "difference in num-frames between feat and ivector matrices"); + po.Register("supervision-length-tolerance", &supervision_length_tolerance, "Tolerance for " + "difference in num-frames-subsampled between supervision and deriv weights"); po.Register("min-post", &min_post, "Minimum posterior to keep; this will " "avoid dumping out all posteriors."); - po.Register("normalization-scale", &normalization_scale, - "Scale normalization FST"); + po.Register("acoustic-scale", &acoustic_scale, + "Scale on the acoustic scores in the lattice"); + po.Register("lm-scale", &lm_scale, + "Scale the LM weights on the lattice and interpolate with " + "1-lm-scale times the normalization FST"); po.Register("deriv-weights-rspecifier", &deriv_weights_rspecifier, "Not implemented"); @@ -291,7 +323,7 @@ int main(int argc, char *argv[]) { ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); KALDI_ASSERT(normalization_fst.NumStates() > 0); - ApplyProbabilityScale(0.5, &normalization_fst); + ApplyProbabilityScale(1.0 - lm_scale, &normalization_fst); } // Read as GeneralMatrix so we don't need to un-compress and re-compress @@ -300,11 +332,13 @@ int main(int argc, char *argv[]) { //chain::RandomAccessSupervisionReader supervision_reader( // supervision_rspecifier); RandomAccessLatticeReader lattice_reader(lattice_rspecifier); - NnetExampleWriter example_writer(examples_wspecifier); + NnetChainExampleWriter example_writer(examples_wspecifier); RandomAccessBaseFloatMatrixReader online_ivector_reader( online_ivector_rspecifier); + RandomAccessBaseFloatVectorReader deriv_weights_reader( + deriv_weights_rspecifier); - int32 num_err = 0; + int32 num_err = 0, num_done = 0; for (; !feat_reader.Done(); feat_reader.Next()) { std::string key = feat_reader.Key(); @@ -339,7 +373,7 @@ int main(int argc, char *argv[]) { } fst::StdVectorFst sup_fst; - fst::ScaleLattice(fst::GraphLatticeScale(0.5), &lat); + fst::ScaleLattice(fst::LatticeScale(lm_scale, acoustic_scale), &lat); ConvertLatticeToPdfLabels(tmodel, lat, &sup_fst); if (normalization_fst.NumStates() > 0 && @@ -352,18 +386,42 @@ int main(int argc, char *argv[]) { // Convert fst to lattice to extract posterior using forward backward. Lattice sup_lat; ConvertFstToLattice(sup_fst, &sup_lat); + + kaldi::uint64 props = sup_lat.Properties(fst::kFstProperties, false); + if (!(props & fst::kTopSorted)) { + if (fst::TopSort(&sup_lat) == false) + KALDI_ERR << "Cycles detected in lattice."; + } + Posterior pdf_post; LatticeForwardBackward(sup_lat, &pdf_post); + const Vector *deriv_weights = NULL; + if (!deriv_weights_rspecifier.empty()) { + if (!deriv_weights_reader.HasKey(key)) { + KALDI_WARN << "No deriv weights for utterance " << key; + num_err++; + continue; + } else { + // this address will be valid until we call HasKey() or Value() + // again. 
+ deriv_weights = &(deriv_weights_reader.Value(key)); + } + } + if (!ProcessFile(feats, online_ivector_feats, online_ivector_period, - pdf_post, min_post, key, compress, tmodel.NumPdfs(), + pdf_post, min_post, deriv_weights, supervision_length_tolerance, + key, compress, tmodel.NumPdfs(), &utt_splitter, &example_writer)) num_err++; + num_done++; } } + if (num_err > 0) - KALDI_WARN << num_err << " utterances had errors and could " - "not be processed."; + KALDI_WARN << "Processed " << num_done << " utterances; " + << num_err << " utterances had errors and could " + "not be processed."; // utt_splitter prints stats in its destructor. return utt_splitter.ExitStatus(); } catch(const std::exception &e) { diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index 2448324c4ae..7f3aba38b59 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -111,6 +111,7 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, SubVector output_weights( &(chunk.output_weights[0]), static_cast(chunk.output_weights.size())); + KALDI_ASSERT(output_weights.Dim() == num_frames_subsampled); if (!deriv_weights) { NnetChainSupervision nnet_supervision("output", supervision_part, @@ -125,7 +126,6 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, if (t < deriv_weights->Dim()) this_deriv_weights(i) = (*deriv_weights)(t); } - KALDI_ASSERT(output_weights.Dim() == num_frames_subsampled); this_deriv_weights.MulElements(output_weights); NnetChainSupervision nnet_supervision("output", supervision_part, this_deriv_weights, diff --git a/src/latbin/lattice-determinize-pruned.cc b/src/latbin/lattice-determinize-pruned.cc index 393d98059f5..90cf264d2dc 100644 --- a/src/latbin/lattice-determinize-pruned.cc +++ b/src/latbin/lattice-determinize-pruned.cc @@ -97,7 +97,8 @@ int main(int argc, char *argv[]) { // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) unordered_map, std::pair, PairHasher > acoustic_scores; - ComputeAcousticScoresMap(lat, &acoustic_scores); + if (!write_compact) + ComputeAcousticScoresMap(lat, &acoustic_scores); Invert(&lat); // so word labels are on the input side. lat_reader.FreeCurrent(); diff --git a/src/matrix/sparse-matrix.cc b/src/matrix/sparse-matrix.cc index 5ad7f2bfeca..d42953b77c5 100644 --- a/src/matrix/sparse-matrix.cc +++ b/src/matrix/sparse-matrix.cc @@ -666,7 +666,7 @@ void SparseMatrix::AppendSparseMatrixRows( for (; input_iter != input_end; ++input_iter) { num_rows += input_iter->rows_.size(); if (sort_by_t) - if (input_iter->rows_.size() == local_row_size) + if (input_iter->rows_.size() != local_row_size) KALDI_ERR << "we can not append sparse matrices with inconsistent " << " number of rows, if sort_by_t is true"; } @@ -682,8 +682,7 @@ void SparseMatrix::AppendSparseMatrixRows( typename std::vector >::iterator input_row_iter = input_iter->rows_.begin(), input_row_end = input_iter->rows_.end(); - t = 0; - for (; input_row_iter != input_row_end; ++input_row_iter, ++t) { + for (t = 0; input_row_iter != input_row_end; ++input_row_iter, ++t) { int32 src_row_index = n + t * num_inputs; rows_[src_row_index].Swap(&(*input_row_iter)); } @@ -973,7 +972,9 @@ void AppendGeneralMatrixRows(const std::vector &src, << num_cols << " vs. 
" << src_cols; } } - Matrix appended_mat(tot_rows, num_cols, kUndefined); + Matrix appended_mat(tot_rows, num_cols); + Matrix appended_mat_check(tot_rows, num_cols, kUndefined); + int32 row_offset = 0; if (sort_by_t) { // reorder the src mat rows to be inserted in appended matrix, in order to @@ -987,14 +988,16 @@ void AppendGeneralMatrixRows(const std::vector &src, if (src_rows != local_row_size) KALDI_ERR << "Appending rows of matrices with inconsistent num-rows " << "with sort-by-t=true is not possible:"; - std::vector reorder_indexes(local_row_size, - static_cast(NULL)); - for (int32 j = 0; j < src_rows; j++) { + std::vector reorder_indexes(local_row_size); + for (int32 j = 0; j < local_row_size; j++) { reorder_indexes[j] = j * size + i; + appended_mat_check.Row(j * size + i).CopyFromVec(full_src_mat.Row(j)); } full_src_mat.AddToRows(1.0, &(reorder_indexes[0]), &appended_mat); row_offset += src_rows; } + + KALDI_ASSERT(appended_mat.ApproxEqual(appended_mat_check)); } else { for (int32 i = 0; i < size; i++) { const GeneralMatrix &src_mat = *(src[i]); diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index a31a071bada..407a6657305 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -221,23 +221,31 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, BaseFloat tot_like, tot_mmi_objf, tot_l2_term, tot_weight; - if (chain_config_.use_smbr_objective) - ComputeChainSmbrObjfAndDeriv( - chain_config_, den_graph_, - sup.supervision, nnet_output, - &tot_like, &tot_mmi_objf, &tot_l2_term, &tot_weight, - (nnet_config_.compute_deriv ? &nnet_output_deriv : - NULL), (use_xent ? &xent_deriv : NULL), - sil_indices_.Dim() ? &sil_indices_ : NULL); - else - ComputeChainObjfAndDeriv(chain_config_, den_graph_, - sup.supervision, nnet_output, - &tot_like, &tot_l2_term, &tot_weight, - (nnet_config_.compute_deriv ? &nnet_output_deriv : - NULL), (use_xent ? &xent_deriv : NULL)); - + if (sup.supervision.numerator_post_targets.NumRows() > 0) { + ComputeKLObjfAndDeriv(chain_config_, den_graph_, + sup.supervision, nnet_output, + &tot_like, &tot_l2_term, &tot_weight, + (nnet_config_.compute_deriv ? &nnet_output_deriv : + NULL), (use_xent ? &xent_deriv : NULL)); + } else { + if (chain_config_.use_smbr_objective) + ComputeChainSmbrObjfAndDeriv( + chain_config_, den_graph_, + sup.supervision, nnet_output, + &tot_like, &tot_mmi_objf, &tot_l2_term, &tot_weight, + (nnet_config_.compute_deriv ? &nnet_output_deriv : + NULL), (use_xent ? &xent_deriv : NULL), + sil_indices_.Dim() ? &sil_indices_ : NULL); + else + ComputeChainObjfAndDeriv(chain_config_, den_graph_, + sup.supervision, nnet_output, + &tot_like, &tot_l2_term, &tot_weight, + (nnet_config_.compute_deriv ? &nnet_output_deriv : + NULL), (use_xent ? 
&xent_deriv : NULL)); + } + BaseFloat objf_scale = 1.0; - { + { unordered_map::iterator it = objective_scales_.find(sup.name); @@ -305,7 +313,7 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, xent_objf *= it->second; xent_deriv.Scale(it->second); } - + xent_totals.tot_weight += tot_weight; xent_totals.tot_like += xent_objf; } @@ -313,6 +321,7 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, } } +/* void NnetChainComputeProb::Compute(const NnetExample &eg) { bool need_model_derivative = nnet_config_.compute_deriv, store_component_stats = nnet_config_.store_component_stats; @@ -368,7 +377,7 @@ void NnetChainComputeProb::ProcessOutputs(const NnetExample &eg, KALDI_ASSERT(io.features.NumRows() % num_sequences == 0); int32 frames_per_sequence = io.features.NumRows() / num_sequences; ComputeKLObjfAndDeriv(chain_config_, den_graph_, - io.features, nnet_output, + io.features, 1.0, nnet_output, num_sequences, frames_per_sequence, &tot_like, &tot_l2_term, &tot_weight, (nnet_config_.compute_deriv ? &nnet_output_deriv : @@ -443,6 +452,7 @@ void NnetChainComputeProb::ProcessOutputs(const NnetExample &eg, num_minibatches_processed_++; } } +*/ bool NnetChainComputeProb::PrintTotalStats() const { bool ans = false; @@ -554,6 +564,7 @@ void RecomputeStats(const std::vector &egs, KALDI_LOG << "Done recomputing stats."; } +/* void RecomputeStats(const std::vector &egs, const chain::ChainTrainingOptions &chain_config_in, const fst::StdVectorFst &den_fst, @@ -578,7 +589,7 @@ void RecomputeStats(const std::vector &egs, prob_computer.PrintTotalStats(); KALDI_LOG << "Done recomputing stats."; } - +*/ } // namespace nnet3 diff --git a/src/nnet3/nnet-chain-diagnostics.h b/src/nnet3/nnet-chain-diagnostics.h index 58f84131bd2..ee7ccddf19b 100644 --- a/src/nnet3/nnet-chain-diagnostics.h +++ b/src/nnet3/nnet-chain-diagnostics.h @@ -85,7 +85,7 @@ class NnetChainComputeProb { void Compute(const NnetChainExample &chain_eg); // compute objective on one minibatch. - void Compute(const NnetExample &eg); + // void Compute(const NnetExample &eg); // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; @@ -106,8 +106,8 @@ class NnetChainComputeProb { void ProcessOutputs(const NnetChainExample &chain_eg, NnetComputer *computer); - void ProcessOutputs(const NnetExample &chain_eg, - NnetComputer *computer); + // void ProcessOutputs(const NnetExample &chain_eg, + // NnetComputer *computer); NnetComputeProbOptions nnet_config_; chain::ChainTrainingOptions chain_config_; @@ -134,10 +134,10 @@ void RecomputeStats(const std::vector &egs, const fst::StdVectorFst &den_fst, Nnet *nnet); -void RecomputeStats(const std::vector &egs, - const chain::ChainTrainingOptions &chain_config, - const fst::StdVectorFst &den_fst, - Nnet *nnet); +//void RecomputeStats(const std::vector &egs, +// const chain::ChainTrainingOptions &chain_config, +// const fst::StdVectorFst &den_fst, +// Nnet *nnet); } // namespace nnet3 diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index 7dbaac04075..c4e7f900c3b 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -31,7 +31,7 @@ void NnetChainSupervision::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, name); WriteIndexVector(os, binary, indexes); supervision.Write(os, binary); - WriteToken(os, binary, ""); // for DerivWeights. Want to save space. 
+ WriteToken(os, binary, ""); deriv_weights.Write(os, binary); WriteToken(os, binary, ""); } diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 8802e6531fe..ebe30d09dd2 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -117,6 +117,7 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, } } +/* void NnetChainTrainer::Train(const NnetExample &eg) { bool need_model_derivative = true; const NnetTrainerOptions &nnet_config = opts_.nnet_config; @@ -132,6 +133,7 @@ void NnetChainTrainer::Train(const NnetExample &eg) { num_minibatches_processed_++; } +*/ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { bool need_model_derivative = true; @@ -174,6 +176,9 @@ class ChainTrainerMemoryHolder { ChainTrainerMemoryHolder(const Nnet &nnet, int32 num_den_graph_states, const NnetChainExample &eg); + //ChainTrainerMemoryHolder(const Nnet &nnet, + // int32 num_den_graph_states, + // const NnetExample &eg); private: CuMatrix nnet_output_deriv_; CuMatrix xent_output_deriv_; @@ -238,6 +243,66 @@ ChainTrainerMemoryHolder::ChainTrainerMemoryHolder(const Nnet &nnet, beta_.Resize(2, max_sequence_size, kUndefined); } +/* +ChainTrainerMemoryHolder::ChainTrainerMemoryHolder(const Nnet &nnet, + int32 den_graph_states, + const NnetExample &eg) { + + std::vector::const_iterator iter = eg.io.begin(), + end = eg.io.end(); + + int32 max_rows = 0, + max_cols = 0; + + size_t max_frames_per_sequence = 0, + max_sequence_size = 0, + max_alpha_matrix_size = 0; + + for (; iter != end; ++iter) { + const NnetIo &io = *iter; + int32 node_index = nnet.GetNodeIndex(io.name); + KALDI_ASSERT(node_index >= 0); + if (!nnet.IsOutputNode(node_index)) continue; + + int32 output_rows = io.features.NumRows(); + int32 output_cols = nnet.OutputDim("output"); + + int32 num_sequences = NumSequencesInChainEg(io.indexes); + size_t curr_frames_per_sequence = output_rows / num_sequences + 1; + size_t den_graph_size = den_graph_states + 1; + size_t curr_sequence_size = den_graph_size * num_sequences; + size_t curr_alpha_matrix_size = curr_frames_per_sequence * curr_sequence_size; + + if (curr_alpha_matrix_size > max_alpha_matrix_size) { + max_alpha_matrix_size = curr_alpha_matrix_size; + max_frames_per_sequence = curr_frames_per_sequence; + max_sequence_size = curr_sequence_size; + } + + size_t matrix_size = output_rows * output_cols; + if (matrix_size > (max_rows * max_cols)) { + max_rows = output_rows; + max_cols = output_cols; + } + } + + // the sequence of resizes is in a specific order (bigger to smaller) + // so that the cudaMalloc won't trash the memory it has already + // alloc'd in the previous iterations + alpha_.Resize(max_frames_per_sequence, + max_sequence_size, + kUndefined); + + + nnet_output_deriv_.Resize(max_rows, max_cols, kUndefined); + // note: the same block of memory can be used for xent_output_deriv_ as is + // used for exp_nnet_output_transposed_ in chain-training.cc. 
+ xent_output_deriv_.Resize(max_rows, max_cols, + kUndefined, kStrideEqualNumCols); + + beta_.Resize(2, max_sequence_size, kUndefined); +} + void NnetChainTrainer::TrainInternal(const NnetExample &eg, const NnetComputation &computation) { const NnetTrainerOptions &nnet_config = opts_.nnet_config; @@ -289,6 +354,7 @@ void NnetChainTrainer::TrainInternal(const NnetExample &eg, else ScaleNnet(0.0, delta_nnet_); } +*/ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, const NnetComputation &computation) { @@ -405,6 +471,7 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, ScaleNnet(0.0, delta_nnet_); } +/* void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, const NnetExample &eg, NnetComputer *computer) { @@ -432,7 +499,7 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, KALDI_ASSERT(io.features.NumRows() % num_sequences == 0); int32 frames_per_sequence = io.features.NumRows() / num_sequences; ComputeKLObjfAndDeriv(opts_.chain_config, den_graph_, - io.features, nnet_output, + io.features, 1.0, nnet_output, num_sequences, frames_per_sequence, &tot_objf, &tot_l2_term, &tot_weight, &nnet_output_deriv, @@ -458,8 +525,7 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, xent_name); // at this point, xent_deriv is posteriors derived from the numerato // computation. note, xent_objf has a factor of '.supervision.weight' - CuMatrix cu_post(io.features.GetFullMatrix()); - BaseFloat xent_objf = TraceMatMat(xent_output, cu_post, kTrans); + BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans); { unordered_map::iterator it = @@ -477,8 +543,8 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, tot_weight, xent_objf); } - if (opts_.apply_deriv_weights) { - CuVector cu_deriv_weights; + if (opts_.apply_deriv_weights && io.deriv_weights.Dim() > 0) { + CuVector cu_deriv_weights(io.deriv_weights); nnet_output_deriv.MulRowsVec(cu_deriv_weights); if (use_xent) xent_deriv.MulRowsVec(cu_deriv_weights); @@ -526,6 +592,7 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, } } } +*/ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, const NnetChainExample &eg, @@ -555,24 +622,32 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, BaseFloat tot_objf, tot_mmi_objf, tot_l2_term, tot_weight; - if (opts_.chain_config.use_smbr_objective) { - ComputeChainSmbrObjfAndDeriv(opts_.chain_config, den_graph_, - sup.supervision, nnet_output, - &tot_objf, &tot_mmi_objf, - &tot_l2_term, &tot_weight, - &nnet_output_deriv, - (use_xent ? &xent_deriv : NULL), - sil_indices_.Dim() ? &sil_indices_ : NULL); + if (sup.supervision.numerator_post_targets.NumRows() > 0) { + ComputeKLObjfAndDeriv(opts_.chain_config, den_graph_, + sup.supervision, nnet_output, + &tot_objf, &tot_l2_term, &tot_weight, + &nnet_output_deriv, + (use_xent ? &xent_deriv : NULL)); } else { - ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, - sup.supervision, nnet_output, - &tot_objf, &tot_l2_term, &tot_weight, - &nnet_output_deriv, - (use_xent ? &xent_deriv : NULL)); + if (opts_.chain_config.use_smbr_objective) { + ComputeChainSmbrObjfAndDeriv(opts_.chain_config, den_graph_, + sup.supervision, nnet_output, + &tot_objf, &tot_mmi_objf, + &tot_l2_term, &tot_weight, + &nnet_output_deriv, + (use_xent ? &xent_deriv : NULL), + sil_indices_.Dim() ? 
&sil_indices_ : NULL); + } else { + ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, + sup.supervision, nnet_output, + &tot_objf, &tot_l2_term, &tot_weight, + &nnet_output_deriv, + (use_xent ? &xent_deriv : NULL)); + } } BaseFloat objf_scale = 1.0; - { + { unordered_map::iterator it = objective_scales_.find(sup.name); @@ -634,7 +709,7 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, aux_objf_scales.push_back(objf_scale * opts_.chain_config.mmi_factor); } - ObjectiveFunctionInfo totals(objf_scale, aux_objf_scales); + ObjectiveFunctionInfo totals(this_objf_scale, aux_objf_scales); it = objf_info_.insert(it, std::make_pair(sup.name + suffix, totals)); } diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h index a8f388c6b19..5ec058f8b52 100644 --- a/src/nnet3/nnet-chain-training.h +++ b/src/nnet3/nnet-chain-training.h @@ -66,7 +66,7 @@ class NnetChainTrainer { void Train(const NnetChainExample &eg); // train on one minibatch using NnetExample - void Train(const NnetExample &eg); + // void Train(const NnetExample &eg); // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; @@ -82,8 +82,8 @@ class NnetChainTrainer { const NnetComputation &computation); // The internal function for doing one step of conventional SGD training. - void TrainInternal(const NnetExample &eg, - const NnetComputation &computation); + // void TrainInternal(const NnetExample &eg, + // const NnetComputation &computation); // The internal function for doing one step of backstitch training. Depending // on whether is_backstitch_step1 is true, It could be either the first @@ -95,8 +95,8 @@ class NnetChainTrainer { void ProcessOutputs(bool is_backstitch_step2, const NnetChainExample &eg, NnetComputer *computer); - void ProcessOutputs(bool is_backstitch_step2, const NnetExample &eg, - NnetComputer *computer); + // void ProcessOutputs(bool is_backstitch_step2, const NnetExample &eg, + // NnetComputer *computer); const NnetChainTrainingOptions opts_; diff --git a/src/nnet3/nnet-diagnostics.cc b/src/nnet3/nnet-diagnostics.cc index 8f9f1be24e4..8bf9852bcf7 100644 --- a/src/nnet3/nnet-diagnostics.cc +++ b/src/nnet3/nnet-diagnostics.cc @@ -125,6 +125,7 @@ void NnetComputeProb::ProcessOutputs(const NnetExample &eg, if (config_.compute_accuracy) { BaseFloat tot_weight, tot_accuracy; PerDimObjectiveInfo &acc_totals = accuracy_info_[io.name]; + Vector tot_weight_vec, tot_objective_vec; if (config_.compute_per_dim_accuracy && acc_totals.tot_objective_vec.Dim() == 0) { @@ -132,14 +133,24 @@ void NnetComputeProb::ProcessOutputs(const NnetExample &eg, acc_totals.tot_weight_vec.Resize(output.NumCols()); } + if (config_.compute_per_dim_accuracy) { + tot_objective_vec.Resize(output.NumCols()); + tot_weight_vec.Resize(output.NumCols()); + } + ComputeAccuracy(io.features, output, &tot_weight, &tot_accuracy, config_.compute_per_dim_accuracy ? - &acc_totals.tot_weight_vec : NULL, + &tot_weight_vec : NULL, config_.compute_per_dim_accuracy ? 
- &acc_totals.tot_objective_vec : NULL); + &tot_objective_vec : NULL); acc_totals.tot_weight += tot_weight; acc_totals.tot_objective += tot_accuracy; + + if (config_.compute_per_dim_accuracy) { + acc_totals.tot_objective_vec.AddVec(1.0, tot_objective_vec); + acc_totals.tot_weight_vec.AddVec(1.0, tot_weight_vec); + } } } } diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 197eefdf424..4ea715164a6 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -99,6 +99,8 @@ static void MergeIo(const std::vector &src, // The features in the different NnetIo in the Indexes across all examples std::vector > output_lists(num_feats); + std::vector const*> > deriv_weights_lists(num_feats); + // Initialize the merged_eg merged_eg->io.clear(); merged_eg->io.resize(num_feats); @@ -131,6 +133,14 @@ static void MergeIo(const std::vector &src, // Add f'th Io's features output_lists[f].push_back(&(io.features)); + if (io.deriv_weights.Dim() != 0 && + merged_eg->io[f].deriv_weights.Dim() == 0) { + merged_eg->io[f].deriv_weights.Resize(sizes[f], kUndefined); + } + + if (merged_eg->io[f].deriv_weights.Dim() != 0) + deriv_weights_lists[f].push_back(&(io.deriv_weights)); + // Work on the Indexes for the f^th Io in merged_eg NnetIo &output_io = merged_eg->io[f]; std::copy(io.indexes.begin(), io.indexes.end(), @@ -148,23 +158,52 @@ static void MergeIo(const std::vector &src, this_offset += this_size; // note: this_offset is a reference. } } - // If sort_by_t is true, the indexes is rearranged to be sorted - // first by 't' and next by 'n'. - for (int32 f = 0; f < num_feats; f++) { - NnetIo output_io = merged_eg->io[f]; - if (sort_by_t) - if (output_io.name == "output") - std::sort(output_io.indexes.begin(), output_io.indexes.end()); - } KALDI_ASSERT(cur_size == sizes); for (int32 f = 0; f < num_feats; f++) { + NnetIo &output_io = merged_eg->io[f]; + AppendGeneralMatrixRows(output_lists[f], - &(merged_eg->io[f].features), - sort_by_t); + &(output_io.features), + output_io.name == "output" ? sort_by_t : false); + if (compress) { // the following won't do anything if the features were sparse. - merged_eg->io[f].features.Compress(); + output_io.features.Compress(); + } + + if (output_io.name != "output") continue; + + if (sort_by_t) + std::sort(output_io.indexes.begin(), output_io.indexes.end()); + + if (output_io.deriv_weights.Dim() != 0) { + // merge the deriv_weights. + int32 num_inputs = deriv_weights_lists[f].size(); + KALDI_ASSERT(num_inputs > 0 + && deriv_weights_lists[f][0]->Dim() != 0); + int32 frames_per_sequence = deriv_weights_lists[f][0]->Dim(); + + if (output_io.deriv_weights.Dim() != frames_per_sequence * num_inputs) + KALDI_ERR << output_io.deriv_weights.Dim() + << " != " << frames_per_sequence << " * " << num_inputs; + + for (int32 n = 0; n < num_inputs; n++) { + const Vector &src_deriv_weights = *(deriv_weights_lists[f][n]); + KALDI_ASSERT(src_deriv_weights.Dim() == frames_per_sequence); + + if (sort_by_t) { + // the ordering of the deriv_weights corresponds to the ordering of the + // Indexes, where the time dimension has the greater stride. 
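+          // e.g. with num_inputs == 2 the merged order is (t=0,n=0), (t=0,n=1),
+          // (t=1,n=0), ..., so the weight for frame t of sequence n goes to
+          // position t * num_inputs + n.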
+ for (int32 t = 0; t < frames_per_sequence; t++) { + output_io.deriv_weights(t * num_inputs + n) = src_deriv_weights(t); + } + } else { + for (int32 t = 0; t < frames_per_sequence; t++) { + output_io.deriv_weights(t + n * num_inputs) = src_deriv_weights(t); + } + } + } } } } @@ -211,6 +250,8 @@ void ShiftExampleTimes(int32 t_offset, } } } + + void GetComputationRequest(const Nnet &nnet, const NnetExample &eg, bool need_model_derivative, @@ -244,7 +285,7 @@ void GetComputationRequest(const Nnet &nnet, size_t cur_size = request->outputs.size(); request->outputs.resize(cur_size + 1); IoSpecification &io_spec = request->outputs[cur_size - 1], - io_spec_xent = request->outputs[cur_size]; + &io_spec_xent = request->outputs[cur_size]; // the IoSpecification for the -xent output is the same // as for the regular output, except for its name which has // the -xent suffix (and the has_deriv member may differ). diff --git a/src/nnet3/nnet-example.cc b/src/nnet3/nnet-example.cc index b79a547ffcf..048797be21a 100644 --- a/src/nnet3/nnet-example.cc +++ b/src/nnet3/nnet-example.cc @@ -31,6 +31,10 @@ void NnetIo::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, name); WriteIndexVector(os, binary, indexes); features.Write(os, binary); + if (deriv_weights.Dim() > 0) { + WriteToken(os, binary, ""); + deriv_weights.Write(os, binary); + } WriteToken(os, binary, ""); KALDI_ASSERT(static_cast(features.NumRows()) == indexes.size()); } @@ -40,7 +44,15 @@ void NnetIo::Read(std::istream &is, bool binary) { ReadToken(is, binary, &name); ReadIndexVector(is, binary, &indexes); features.Read(is, binary); - ExpectToken(is, binary, ""); + + std::string token; + ReadToken(is, binary, &token); + if (token != "") { + KALDI_ASSERT(token == ""); + deriv_weights.Read(is, binary); + ReadToken(is, binary, &token); + } + KALDI_ASSERT(token == ""); } bool NnetIo::operator == (const NnetIo &other) const { @@ -49,6 +61,8 @@ bool NnetIo::operator == (const NnetIo &other) const { if (features.NumRows() != other.features.NumRows() || features.NumCols() != other.features.NumCols()) return false; + if (deriv_weights.Dim() > 0 && + !deriv_weights.ApproxEqual(other.deriv_weights)) return false; Matrix this_mat, other_mat; features.GetMatrix(&this_mat); other.features.GetMatrix(&other_mat); @@ -81,6 +95,7 @@ void NnetIo::Swap(NnetIo *other) { name.swap(other->name); indexes.swap(other->indexes); features.Swap(&(other->features)); + deriv_weights.Swap(&(other->deriv_weights)); } NnetIo::NnetIo(const std::string &name, @@ -98,6 +113,22 @@ NnetIo::NnetIo(const std::string &name, indexes[i].t = t_begin + i * t_stride; } +NnetIo::NnetIo(const std::string &name, + int32 dim, + int32 t_begin, + const Posterior &labels, + const VectorBase &deriv_weights, + int32 t_stride): + name(name), deriv_weights(deriv_weights) { + int32 num_rows = labels.size(); + KALDI_ASSERT(num_rows > 0); + SparseMatrix sparse_feats(dim, labels); + features = sparse_feats; + indexes.resize(num_rows); // sets all n,t,x to zeros. + for (int32 i = 0; i < num_rows; i++) + indexes[i].t = t_begin + i * t_stride; + KALDI_ASSERT(num_rows == deriv_weights.Dim()); +} void NnetExample::Write(std::ostream &os, bool binary) const { diff --git a/src/nnet3/nnet-example.h b/src/nnet3/nnet-example.h index d7312d49729..703ab555bfa 100644 --- a/src/nnet3/nnet-example.h +++ b/src/nnet3/nnet-example.h @@ -45,6 +45,8 @@ struct NnetIo { /// a Matrix, or SparseMatrix (a SparseMatrix would be the natural format for posteriors). 
GeneralMatrix features; + Vector deriv_weights; + /// This constructor creates NnetIo with name "name", indexes with n=0, x=0, /// and t values ranging from t_begin to /// (t_begin + t_stride * feats.NumRows() - 1) with a stride t_stride, and @@ -73,6 +75,13 @@ struct NnetIo { const Posterior &labels, int32 t_stride = 1); + NnetIo(const std::string &name, + int32 dim, + int32 t_begin, + const Posterior &labels, + const VectorBase &deriv_weights, + int32 t_stride = 1); + void Swap(NnetIo *other); NnetIo() { } From 0817ffd2a238e05cfd06743eb011a3b321180b45 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 20 Mar 2018 17:40:46 -0400 Subject: [PATCH 137/174] Changes for sequence KL training --- .../multi_condition/run_ivector_common.sh | 99 +-- .../s5b/local/nnet3/prepare_lores_feats.sh | 15 - .../s5b/local/prepare_parallel_train_data.sh | 2 +- egs/ami/s5b/path.sh | 2 + egs/ami/s5b/run.sh | 3 + egs/ami/s5b/run_prepare_shared.sh | 1 + egs/aspire/s5/conf/mfcc_hires.conf | 1 + .../s5/local/fisher_create_test_lang.sh | 31 +- .../generate_uniformly_segmented_data_dir.sh | 4 +- .../s5/local/nnet3/segment_and_decode.sh | 20 +- egs/aspire/s5/local/run_asr_segmentation.sh | 6 +- .../steps/libs/nnet3/train/chain_objf/ts.py | 471 +++++++++++ egs/wsj/s5/steps/nnet3/align_lats.sh | 4 +- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 11 +- egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh | 7 +- egs/wsj/s5/steps/nnet3/chain/get_egs_ts.sh | 492 +++++++++++ egs/wsj/s5/steps/nnet3/chain/train_ts.py | 761 ++++++++++++++++++ egs/wsj/s5/steps/nnet3/get_egs.sh | 2 +- .../allocate_multilingual_examples.py | 70 +- .../segmentation/detect_speech_activity.sh | 4 +- 20 files changed, 1850 insertions(+), 156 deletions(-) create mode 100644 egs/wsj/s5/steps/libs/nnet3/train/chain_objf/ts.py create mode 100755 egs/wsj/s5/steps/nnet3/chain/get_egs_ts.sh create mode 100755 egs/wsj/s5/steps/nnet3/chain/train_ts.py diff --git a/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh b/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh index eb20415e515..69f655d9039 100755 --- a/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh +++ b/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh @@ -10,13 +10,8 @@ set -e -o pipefail stage=1 mic=ihm nj=30 -min_seg_len=1.55 # min length in seconds... we do this because chain training - # will discard segments shorter than 1.5 seconds. Must remain in sync with - # the same option given to prepare_lores_feats.sh. train_set=train_cleaned # you might set this to e.g. train_cleaned. -gmm=tri3_cleaned # This specifies a GMM-dir from the features of the type you're training the system on; - # it should contain alignments for 'train_set'. - +norvb_datadir=data/ihm/train_cleaned_sp num_threads_ubm=32 rvb_affix=_rvb @@ -30,10 +25,7 @@ num_data_reps=1 nnet3_affix=${nnet3_affix}$rvb_affix -gmmdir=exp/${mic}/${gmm} - - -for f in data/${mic}/${train_set}/feats.scp ${gmmdir}/final.mdl; do +for f in data/${mic}/${train_set}/feats.scp; do if [ ! -f $f ]; then echo "$0: expected file $f to exist" exit 1 @@ -79,29 +71,16 @@ if [ $stage -le 1 ]; then done fi -if [ $stage -le 2 ]; then - echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" - # we have to combine short segments or we won't be able to train chain models - # on those segments. - utils/data/combine_short_segments.sh \ - data/${mic}/${train_set}_sp_hires $min_seg_len data/${mic}/${train_set}_sp_hires_comb - - # just copy over the CMVN to avoid having to recompute it. 
- cp data/${mic}/${train_set}_sp_hires/cmvn.scp data/${mic}/${train_set}_sp_hires_comb/ - utils/fix_data_dir.sh data/${mic}/${train_set}_sp_hires_comb/ -fi if [ $stage -le 3 ]; then echo "$0: creating reverberated MFCC features" - datadir=data/ihm/train_cleaned_sp - - mfccdir=${datadir}_rvb${num_data_reps}_hires/data + mfccdir=${norvb_datadir}_rvb${num_data_reps}_hires/data if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage fi - if [ ! -f ${datadir}_rvb${num_data_reps}_hires/feats.scp ]; then + if [ ! -f ${norvb_datadir}_rvb${num_data_reps}_hires/feats.scp ]; then if [ ! -d "RIRS_NOISES" ]; then # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip @@ -124,59 +103,28 @@ if [ $stage -le 3 ]; then --num-replications ${num_data_reps} \ --max-noises-per-minute 1 \ --source-sampling-rate 16000 \ - ${datadir} ${datadir}_rvb${num_data_reps} + ${norvb_datadir} ${norvb_datadir}_rvb${num_data_reps} - utils/copy_data_dir.sh ${datadir}_rvb${num_data_reps} ${datadir}_rvb${num_data_reps}_hires - utils/data/perturb_data_dir_volume.sh ${datadir}_rvb${num_data_reps}_hires + utils/copy_data_dir.sh ${norvb_datadir}_rvb${num_data_reps} ${norvb_datadir}_rvb${num_data_reps}_hires + utils/data/perturb_data_dir_volume.sh ${norvb_datadir}_rvb${num_data_reps}_hires steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" ${datadir}_rvb${num_data_reps}_hires - steps/compute_cmvn_stats.sh ${datadir}_rvb${num_data_reps}_hires - utils/fix_data_dir.sh ${datadir}_rvb${num_data_reps}_hires - - utils/data/combine_short_segments.sh \ - ${datadir}_rvb${num_data_reps}_hires $min_seg_len ${datadir}_rvb${num_data_reps}_hires_comb - - # just copy over the CMVN to avoid having to recompute it. - cp ${datadir}_rvb${num_data_reps}_hires/cmvn.scp ${datadir}_rvb${num_data_reps}_hires_comb/ - utils/fix_data_dir.sh ${datadir}_rvb${num_data_reps}_hires_comb/ + --cmd "$train_cmd" ${norvb_datadir}_rvb${num_data_reps}_hires + steps/compute_cmvn_stats.sh ${norvb_datadir}_rvb${num_data_reps}_hires + utils/fix_data_dir.sh ${norvb_datadir}_rvb${num_data_reps}_hires fi - utils/combine_data.sh data/${mic}/${train_set}_sp_rvb_hires data/${mic}/${train_set}_sp_hires ${datadir}_rvb${num_data_reps}_hires - utils/combine_data.sh data/${mic}/${train_set}_sp_rvb_hires_comb data/${mic}/${train_set}_sp_hires_comb ${datadir}_rvb${num_data_reps}_hires_comb + utils/combine_data.sh data/${mic}/${train_set}_sp_rvb_hires data/${mic}/${train_set}_sp_hires ${norvb_datadir}_rvb${num_data_reps}_hires fi +exit 1 if [ $stage -le 4 ]; then - echo "$0: selecting segments of hires training data that were also present in the" - echo " ... original training data." - - # note, these data-dirs are temporary; we put them in a sub-directory - # of the place where we'll make the alignments. - temp_data_root=exp/$mic/nnet3${nnet3_affix}/tri5 - mkdir -p $temp_data_root - - utils/data/subset_data_dir.sh --utt-list data/${mic}/${train_set}/feats.scp \ - data/${mic}/${train_set}_sp_hires $temp_data_root/${train_set}_hires - - # note: essentially all the original segments should be in the hires data. 
- n1=$(wc -l data/$mic/t utils/fix_data_dir.sh data/$mic/train_ihmdata -rm $tmpdir/ihmutt2utt +#rm $tmpdir/ihmutt2utt exit 0; diff --git a/egs/ami/s5b/path.sh b/egs/ami/s5b/path.sh index ad2c93b309b..96eb8328ffc 100644 --- a/egs/ami/s5b/path.sh +++ b/egs/ami/s5b/path.sh @@ -11,3 +11,5 @@ BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt export PATH=$PATH:$LMBIN:$BEAMFORMIT:$SRILM +. /etc/profile.d/modules.sh +module load shared cuda80/toolkit diff --git a/egs/ami/s5b/run.sh b/egs/ami/s5b/run.sh index eacc69a6845..f7044826e6a 100755 --- a/egs/ami/s5b/run.sh +++ b/egs/ami/s5b/run.sh @@ -31,6 +31,7 @@ case $(hostname -d) in fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, + cm.gemini) AMI_DIR=/export/common/data/corpora/amicorpus;; # COE esac [ ! -r data/local/lm/final_lm ] && echo "Please, run 'run_prepare_shared.sh' first!" && exit 1 @@ -163,6 +164,8 @@ if [ $stage -le 10 ]; then local/run_cleanup_segmentation.sh --mic $mic fi +exit 0 + if [ $stage -le 11 ]; then ali_opt= [ "$mic" != "ihm" ] && ali_opt="--use-ihm-ali true" diff --git a/egs/ami/s5b/run_prepare_shared.sh b/egs/ami/s5b/run_prepare_shared.sh index 1dc0bf8f20a..f4bfa6ac188 100755 --- a/egs/ami/s5b/run_prepare_shared.sh +++ b/egs/ami/s5b/run_prepare_shared.sh @@ -8,6 +8,7 @@ case $(hostname -d) in fit.vutbr.cz) FISHER_TRANS=/mnt/matylda2/data/FISHER/fe_03_p1_tran ;; # BUT, clsp.jhu.edu) FISHER_TRANS=/export/corpora4/ami/fisher_trans/part1 ;; # JHU, cstr.ed.ac.uk) FISHER_TRANS=`pwd`/eddie_data/lm/data/fisher/part1 ;; # Edinburgh, + cm.gemini) FISHER_TRANS=/export/common/data/corpora/LDC/LDC2004T19_CLSP_format/fe_03_p1_tran/;; # COE *) echo "Please modify the script to add your loaction of the Fisher transcripts, or modify this script."; exit 1;; esac # Or select manually, diff --git a/egs/aspire/s5/conf/mfcc_hires.conf b/egs/aspire/s5/conf/mfcc_hires.conf index d870ab04c38..ee9f9efd92a 100755 --- a/egs/aspire/s5/conf/mfcc_hires.conf +++ b/egs/aspire/s5/conf/mfcc_hires.conf @@ -8,3 +8,4 @@ --num-ceps=40 # there is no dimensionality reduction. --low-freq=40 # low cutoff frequency for mel bins --high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) +--allow-downsample=true diff --git a/egs/aspire/s5/local/fisher_create_test_lang.sh b/egs/aspire/s5/local/fisher_create_test_lang.sh index 6739de822aa..27e0c8f081a 100755 --- a/egs/aspire/s5/local/fisher_create_test_lang.sh +++ b/egs/aspire/s5/local/fisher_create_test_lang.sh @@ -3,47 +3,54 @@ if [ -f path.sh ]; then . ./path.sh; fi -mkdir -p data/lang_test - arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz +large_lm=data/local/lm/4gram-mincount/lm_unpruned.gz +lang=data/lang +dir=data/lang_test + +. utils/parse_options.sh + +mkdir -p $dir + [ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; -cp -rT data/lang data/lang_test +cp -rT $lang $dir gunzip -c "$arpa_lm" | \ arpa2fst --disambig-symbol=#0 \ - --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst + --read-symbol-table=$dir/words.txt - $dir/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic $dir/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. 
echo "First few lines of lexicon FST:" -fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head +fstprint --isymbols=$lang/phones.txt --osymbols=$lang/words.txt $lang/L.fst | head echo Performing further checks # Checking that G.fst is determinizable. -fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. +fstdeterminize $dir/G.fst /dev/null || echo Error determinizing G. # Checking that L_disambig.fst is determinizable. -fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. +fstdeterminize $dir/L_disambig.fst /dev/null || echo Error determinizing L. # Checking that disambiguated lexicon times G is determinizable # Note: we do this with fstdeterminizestar not fstdeterminize, as # fstdeterminize was taking forever (presumbaly relates to a bug # in this version of OpenFst that makes determinization slow for # some case). -fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ +fsttablecompose $dir/L_disambig.fst $dir/G.fst | \ fstdeterminizestar >/dev/null || echo Error # Checking that LG is stochastic: -fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ +fsttablecompose $lang/L_disambig.fst $dir/G.fst | \ fstisstochastic || echo "[log:] LG is not stochastic" - +if [ ! -z "$large_lm" ]; then utils/build_const_arpa_lm.sh \ - data/local/lm/4gram-mincount/lm_unpruned.gz data/lang data/lang_test_fg + $large_lm $lang ${dir}_fg +fi echo "$0 succeeded" diff --git a/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh b/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh index c7cc641c858..f696cd14385 100755 --- a/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh +++ b/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh @@ -31,7 +31,7 @@ fi data_set=$1 segmented_data_set=$2 -if [ "$data_set" == "dev_aspire" ]; then +if [ "$data_set" =~ "dev_aspire" ]; then if [ $stage -le 1 ]; then echo "$0: Creating the data dir with whole recordings without segmentation" # create a whole directory without the segments @@ -58,7 +58,7 @@ if [ $stage -le 2 ]; then utils/validate_data_dir.sh --no-text data/${data_set}_hires fi -segmented_data_set=${data_set}_uniformsegmented_win${window}_over${overlap} +segmented_data_set=${data_set}_uniformsegmented if [ $stage -le 3 ]; then echo "$0: Generating uniform segments with length $window and overlap $overlap." [ -d data/${segmented_data_set}_hires ] && rm -r data/${segmented_data_set}_hires diff --git a/egs/aspire/s5/local/nnet3/segment_and_decode.sh b/egs/aspire/s5/local/nnet3/segment_and_decode.sh index d66b72200c1..e8917d091e2 100755 --- a/egs/aspire/s5/local/nnet3/segment_and_decode.sh +++ b/egs/aspire/s5/local/nnet3/segment_and_decode.sh @@ -109,9 +109,9 @@ fi if [ $stage -le 4 ]; then utils/copy_data_dir.sh $sad_work_dir/${segmented_data_set}_seg \ - data/${segmented_data_set}_hires - steps/compute_cmvn_stats.sh data/${segmented_data_set}_hires - utils/fix_data_dir.sh data/${segmented_data_set}_hires + data/${segmented_data_set}_seg_hires + steps/compute_cmvn_stats.sh data/${segmented_data_set}_seg_hires + utils/fix_data_dir.sh data/${segmented_data_set}_seg_hires fi if [ $stage -le 5 ]; then @@ -122,11 +122,11 @@ if [ $stage -le 5 ]; then # acoustic conditions drift over time within the speaker's data. 
steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $decode_num_jobs \ --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ - data/${segmented_data_set}_hires $lang $ivector_root_dir/extractor \ - $ivector_root_dir/ivectors_${segmented_data_set} + data/${segmented_data_set}_seg_hires $lang $ivector_root_dir/extractor \ + $ivector_root_dir/ivectors_${segmented_data_set}_seg fi -decode_dir=$dir/decode_${segmented_data_set}${affix}_pp +decode_dir=$dir/decode_${segmented_data_set}_seg${affix}_pp if [ $stage -le 6 ]; then echo "Generating lattices" rm -f ${decode_dir}_tg/.error @@ -138,8 +138,8 @@ if [ $stage -le 6 ]; then --extra-right-context-final $extra_right_context_final \ --frames-per-chunk "$frames_per_chunk" \ --skip-scoring true ${iter:+--iter $iter} --lattice-beam $lattice_beam \ - --online-ivector-dir $ivector_root_dir/ivectors_${segmented_data_set} \ - $graph data/${segmented_data_set}_hires ${decode_dir}_tg || \ + --online-ivector-dir $ivector_root_dir/ivectors_${segmented_data_set}_seg \ + $graph data/${segmented_data_set}_seg_hires ${decode_dir}_tg || \ { echo "$0: Error decoding" && exit 1; } fi @@ -147,7 +147,7 @@ if [ $stage -le 7 ]; then echo "Rescoring lattices" steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ --skip-scoring true \ - ${lang}_pp_test{,_fg} data/${segmented_data_set}_hires \ + ${lang}_pp_test{,_fg} data/${segmented_data_set}_seg_hires \ ${decode_dir}_{tg,fg}; fi @@ -161,5 +161,5 @@ if [ $stage -le 8 ]; then ${iter:+--iter $iter} \ --decode-mbr true \ --tune-hyper true \ - $lang $decode_dir $act_data_set $segmented_data_set $out_file + $lang $decode_dir $act_data_set ${segmented_data_set}_seg $out_file fi diff --git a/egs/aspire/s5/local/run_asr_segmentation.sh b/egs/aspire/s5/local/run_asr_segmentation.sh index ae5dcacc7c1..7a63ae62937 100755 --- a/egs/aspire/s5/local/run_asr_segmentation.sh +++ b/egs/aspire/s5/local/run_asr_segmentation.sh @@ -40,8 +40,8 @@ nstage=-10 train_stage=-10 test_stage=-10 num_data_reps=3 -affix=_1a # For segmentation -test_affix=1a_silscale0.05 +affix=_1a5 # For segmentation +test_affix=1a5 nnet_affix=1a stage=-1 nj=80 @@ -174,6 +174,8 @@ if [ $stage -le 6 ]; then rm -r ${rvb_targets_dirs[@]} fi +exit 0 + sad_nnet_dir=exp/segmentation${affix}/tdnn_stats_asr_sad_$nnet_affix sad_opts="--extra-left-context 79 --extra-right-context 21 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0 --acwt 0.3" #sad_nnet_dir=exp/segmentation${affix}/tdnn_lstm_asr_sad_$nnet_affix diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/ts.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/ts.py new file mode 100644 index 00000000000..ff416657adf --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/ts.py @@ -0,0 +1,471 @@ + + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. + +""" This is a module with methods which will be used by scripts for +teacher-student training of deep neural network acoustic model with +sequence KL objective. +""" + +import logging +import math +import os +import sys + +import libs.common as common_lib +import libs.nnet3.train.common as common_train_lib +from . 
import acoustic_model as chain_lib + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) + + +def generate_chain_egs(dir, data, lat_dir, egs_dir, + left_context, right_context, + run_opts, stage=0, + left_context_initial=-1, right_context_final=-1, + frame_subsampling_factor=3, + online_ivector_dir=None, + frames_per_iter=20000, frames_per_eg_str="20", srand=0, + egs_opts=None, cmvn_opts=None, transform_dir=None): + """Wrapper for steps/nnet3/chain/get_egs_ts.sh + + See options in that script. + """ + + common_lib.execute_command( + """steps/nnet3/chain/get_egs_ts.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} \ + --right-context {right_context} \ + --left-context-initial {left_context_initial} \ + --right-context-final {right_context_final} \ + --frame-subsampling-factor {frame_subsampling_factor} \ + --stage {stage} \ + --frames-per-iter {frames_per_iter} \ + --frames-per-eg {frames_per_eg_str} \ + --srand {srand} \ + {data} {dir} {lat_dir} {egs_dir}""".format( + command=run_opts.egs_command, + cmvn_opts=cmvn_opts if cmvn_opts is not None else '', + transform_dir=(transform_dir + if transform_dir is not None + else ''), + ivector_dir=(online_ivector_dir + if online_ivector_dir is not None + else ''), + left_context=left_context, + right_context=right_context, + left_context_initial=left_context_initial, + right_context_final=right_context_final, + frame_subsampling_factor=frame_subsampling_factor, + stage=stage, frames_per_iter=frames_per_iter, + frames_per_eg_str=frames_per_eg_str, srand=srand, + data=data, lat_dir=lat_dir, dir=dir, egs_dir=egs_dir, + egs_opts=egs_opts if egs_opts is not None else '')) + + +#def train_new_models(dir, iter, srand, num_jobs, +# num_archives_processed, num_archives, +# raw_model_string, egs_dir, +# apply_deriv_weights, +# min_deriv_time, max_deriv_time_relative, +# l2_regularize, xent_regularize, leaky_hmm_coefficient, +# momentum, max_param_change, +# shuffle_buffer_size, num_chunk_per_minibatch_str, +# frame_subsampling_factor, truncate_deriv_weights, run_opts, +# backstitch_training_scale=0.0, backstitch_training_interval=1, +# use_multitask_egs=False, objective_opts=""): +# """ +# Called from train_one_iteration(), this method trains new models +# with 'num_jobs' jobs, and +# writes files like exp/tdnn_a/24.{1,2,3,..}.raw +# +# We cannot easily use a single parallel SGE job to do the main training, +# because the computation of which archive and which --frame option +# to use for each job is a little complex, so we spawn each one separately. +# this is no longer true for RNNs as we use do not use the --frame option +# but we use the same script for consistency with FF-DNN code +# +# use_multitask_egs : True, if different examples used to train multiple +# tasks or outputs, e.g.multilingual training. +# multilingual egs can be generated using get_egs.sh and +# steps/nnet3/multilingual/allocate_multilingual_examples.py, +# those are the top-level scripts. 
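+#    objective_opts : extra objective-related options (e.g. an --mmi-factor
+#                     setting, judging from the code below) that are passed
+#                     through unchanged to the training binary.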
+# """ +# +# deriv_time_opts = [] +# if min_deriv_time is not None: +# deriv_time_opts.append("--optimization.min-deriv-time={0}".format( +# min_deriv_time)) +# if max_deriv_time_relative is not None: +# deriv_time_opts.append("--optimization.max-deriv-time-relative={0}".format( +# int(max_deriv_time_relative))) +# +# threads = [] +# # the GPU timing info is only printed if we use the --verbose=1 flag; this +# # slows down the computation slightly, so don't accumulate it on every +# # iteration. Don't do it on iteration 0 either, because we use a smaller +# # than normal minibatch size, and people may get confused thinking it's +# # slower for iteration 0 because of the verbose option. +# verbose_opt = ("--verbose=1" if iter % 20 == 0 and iter > 0 else "") +# +# for job in range(1, num_jobs+1): +# # k is a zero-based index that we will derive the other indexes from. +# k = num_archives_processed + job - 1 +# # work out the 1-based archive index. +# archive_index = (k % num_archives) + 1 +# # previous : frame_shift = (k/num_archives) % frame_subsampling_factor +# frame_shift = ((archive_index + k/num_archives) +# % frame_subsampling_factor) +# +# multitask_egs_opts = common_train_lib.get_multitask_egs_opts( +# egs_dir, +# egs_prefix="cegs.", +# archive_index=archive_index, +# use_multitask_egs=use_multitask_egs) +# scp_or_ark = "scp" if use_multitask_egs else "ark" +# cache_io_opts = (("--read-cache={dir}/cache.{iter}".format(dir=dir, +# iter=iter) +# if iter > 0 else "") + +# (" --write-cache={0}/cache.{1}".format(dir, iter + 1) +# if job == 1 else "")) +# +# if truncate_deriv_weights > 0: +# raise NotImplementedError +# +# thread = common_lib.background_command( +# """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ +# nnet3-chain-train-post {parallel_train_opts} {verbose_opt} \ +# --apply-deriv-weights={app_deriv_wts} {objective_opts} \ +# --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ +# {cache_io_opts} --xent-regularize={xent_reg} \ +# {deriv_time_opts} \ +# --print-interval=10 --momentum={momentum} \ +# --max-param-change={max_param_change} \ +# --backstitch-training-scale={backstitch_training_scale} \ +# --backstitch-training-interval={backstitch_training_interval} \ +# --l2-regularize-factor={l2_regularize_factor} \ +# --srand={srand} \ +# "{raw_model}" {dir}/den.fst \ +# "ark,bg:nnet3-copy-egs {multitask_egs_opts} \ +# --frame-shift={fr_shft} \ +# {scp_or_ark}:{egs_dir}/egs.{archive_index}.{scp_or_ark} ark:- | \ +# nnet3-shuffle-egs --buffer-size={buf_size} \ +# --srand={srand} ark:- ark:- | nnet3-merge-egs --sort-by-t \ +# --minibatch-size={num_chunk_per_mb} ark:- ark:- |" \ +# {dir}/{next_iter}.{job}.raw""".format( +# command=run_opts.command, +# train_queue_opt=run_opts.train_queue_opt, +# dir=dir, iter=iter, srand=iter + srand, +# next_iter=iter + 1, job=job, +# deriv_time_opts=" ".join(deriv_time_opts), +# app_deriv_wts=apply_deriv_weights, +# fr_shft=frame_shift, l2=l2_regularize, +# xent_reg=xent_regularize, leaky=leaky_hmm_coefficient, +# cache_io_opts=cache_io_opts, +# parallel_train_opts=run_opts.parallel_train_opts, +# verbose_opt=verbose_opt, +# momentum=momentum, max_param_change=max_param_change, +# backstitch_training_scale=backstitch_training_scale, +# backstitch_training_interval=backstitch_training_interval, +# l2_regularize_factor=1.0/num_jobs, +# raw_model=raw_model_string, +# egs_dir=egs_dir, archive_index=archive_index, +# buf_size=shuffle_buffer_size, +# num_chunk_per_mb=num_chunk_per_minibatch_str, +# 
multitask_egs_opts=multitask_egs_opts, +# scp_or_ark=scp_or_ark, +# objective_opts=objective_opts), +# require_zero_status=True) +# +# threads.append(thread) +# +# for thread in threads: +# thread.join() +# +# +#def train_one_iteration(dir, iter, srand, egs_dir, +# num_jobs, num_archives_processed, num_archives, +# learning_rate, shrinkage_value, +# num_chunk_per_minibatch_str, +# apply_deriv_weights, min_deriv_time, +# max_deriv_time_relative, +# l2_regularize, xent_regularize, +# leaky_hmm_coefficient, +# momentum, max_param_change, shuffle_buffer_size, +# frame_subsampling_factor, truncate_deriv_weights, +# run_opts, dropout_edit_string="", +# backstitch_training_scale=0.0, backstitch_training_interval=1, +# use_multitask_egs=False, +# objective_opts=""): +# """ Called from steps/nnet3/chain/train_ts.py for one iteration for +# neural network training with LF-MMI objective +# +# """ +# +# # Set off jobs doing some diagnostics, in the background. +# # Use the egs dir from the previous iteration for the diagnostics +# # check if different iterations use the same random seed +# if os.path.exists('{0}/srand'.format(dir)): +# try: +# saved_srand = int(open('{0}/srand'.format(dir)).readline().strip()) +# except (IOError, ValueError): +# logger.error("Exception while reading the random seed " +# "for training") +# raise +# if srand != saved_srand: +# logger.warning("The random seed provided to this iteration " +# "(srand={0}) is different from the one saved last " +# "time (srand={1}). Using srand={0}.".format( +# srand, saved_srand)) +# else: +# with open('{0}/srand'.format(dir), 'w') as f: +# f.write(str(srand)) +# +# # Sets off some background jobs to compute train and +# # validation set objectives +# compute_train_cv_probabilities( +# dir=dir, iter=iter, egs_dir=egs_dir, +# l2_regularize=l2_regularize, xent_regularize=xent_regularize, +# leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, +# use_multitask_egs=use_multitask_egs, +# objective_opts=objective_opts) +# +# if iter > 0: +# # Runs in the background +# chain_lib.compute_progress(dir, iter, run_opts) +# +# do_average = (iter > 0) +# +# raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} " +# "--scale={1} {2}/{3}.mdl - |".format( +# learning_rate, shrinkage_value, dir, iter)) +# +# if do_average: +# cur_num_chunk_per_minibatch_str = num_chunk_per_minibatch_str +# cur_max_param_change = max_param_change +# else: +# # on iteration zero, use a smaller minibatch size (and we will later +# # choose the output of just one of the jobs): the model-averaging isn't +# # always helpful when the model is changing too fast (i.e. it can worsen +# # the objective function), and the smaller minibatch size will help to +# # keep the update stable. 
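+#        # e.g. a minibatch-size string such as "128,64" would (presumably)
+#        # be halved to "64,32" here, and max-param-change is divided by
+#        # sqrt(2) to match the smaller minibatches.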
+# cur_num_chunk_per_minibatch_str = common_train_lib.halve_minibatch_size_str( +# num_chunk_per_minibatch_str) +# cur_max_param_change = float(max_param_change) / math.sqrt(2) +# +# raw_model_string = raw_model_string + dropout_edit_string +# +# shrink_info_str = '' +# if shrinkage_value != 1.0: +# shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value) +# +# objf_info = "" if objective_opts == "" else ( +# "and objective_opts=" + objective_opts) +# logger.info("On iteration {0}, learning rate is {1}" +# "{shrink_info} {objf_info}.".format( +# iter, learning_rate, +# shrink_info=shrink_info_str, objf_info=objf_info)) +# +# train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs, +# num_archives_processed=num_archives_processed, +# num_archives=num_archives, +# raw_model_string=raw_model_string, +# egs_dir=egs_dir, +# apply_deriv_weights=apply_deriv_weights, +# min_deriv_time=min_deriv_time, +# max_deriv_time_relative=max_deriv_time_relative, +# l2_regularize=l2_regularize, +# xent_regularize=xent_regularize, +# leaky_hmm_coefficient=leaky_hmm_coefficient, +# momentum=momentum, +# max_param_change=cur_max_param_change, +# shuffle_buffer_size=shuffle_buffer_size, +# num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str, +# frame_subsampling_factor=frame_subsampling_factor, +# truncate_deriv_weights=truncate_deriv_weights, +# run_opts=run_opts, +# # linearly increase backstitch_training_scale during the +# # first few iterations (hard-coded as 15) +# backstitch_training_scale=(backstitch_training_scale * +# iter / 15 if iter < 15 else backstitch_training_scale), +# backstitch_training_interval=backstitch_training_interval, +# use_multitask_egs=use_multitask_egs, +# objective_opts=objective_opts) +# +# [models_to_average, best_model] = common_train_lib.get_successful_models( +# num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) +# nnets_list = [] +# for n in models_to_average: +# nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) +# +# if do_average: +# # average the output of the different jobs. +# common_train_lib.get_average_nnet_model( +# dir=dir, iter=iter, +# nnets_list=" ".join(nnets_list), +# run_opts=run_opts) +# +# else: +# # choose the best model from different jobs +# common_train_lib.get_best_nnet_model( +# dir=dir, iter=iter, +# best_model_index=best_model, +# run_opts=run_opts) +# +# try: +# for i in range(1, num_jobs + 1): +# os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) +# except OSError: +# raise Exception("Error while trying to delete the raw models") +# +# new_model = "{0}/{1}.mdl".format(dir, iter + 1) +# +# if not os.path.isfile(new_model): +# raise Exception("Could not find {0}, at the end of " +# "iteration {1}".format(new_model, iter)) +# elif os.stat(new_model).st_size == 0: +# raise Exception("{0} has size 0. 
Something went wrong in " +# "iteration {1}".format(new_model, iter)) +# if os.path.exists("{0}/cache.{1}".format(dir, iter)): +# os.remove("{0}/cache.{1}".format(dir, iter)) +# +# +#def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, +# xent_regularize, leaky_hmm_coefficient, +# run_opts, +# use_multitask_egs=False, +# objective_opts=""): +# model = '{0}/{1}.mdl'.format(dir, iter) +# scp_or_ark = "scp" if use_multitask_egs else "ark" +# egs_suffix = ".scp" if use_multitask_egs else ".egs" +# +# multitask_egs_opts = common_train_lib.get_multitask_egs_opts( +# egs_dir, +# egs_prefix="valid_diagnostic.", +# use_multitask_egs=use_multitask_egs) +# +# import re +# objective_opts = re.sub(r"--mmi-factor=0.0 ", "--mmi-factor=1e-10 ", +# objective_opts) +# +# common_lib.background_command( +# """{command} {dir}/log/compute_prob_valid.{iter}.log \ +# nnet3-chain-compute-prob-post --l2-regularize={l2} {objective_opts} \ +# --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ +# {model} {dir}/den.fst \ +# "ark,bg:nnet3-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/valid_diagnostic{egs_suffix} \ +# ark:- | nnet3-merge-egs --sort-by-t --minibatch-size=1:64 ark:- ark:- |" \ +# """.format(command=run_opts.command, dir=dir, iter=iter, model=model, +# l2=l2_regularize, leaky=leaky_hmm_coefficient, +# xent_reg=xent_regularize, +# egs_dir=egs_dir, +# multitask_egs_opts=multitask_egs_opts, +# scp_or_ark=scp_or_ark, egs_suffix=egs_suffix, +# objective_opts=objective_opts)) +# +# multitask_egs_opts = common_train_lib.get_multitask_egs_opts( +# egs_dir, +# egs_prefix="train_diagnostic.", +# use_multitask_egs=use_multitask_egs) +# +# common_lib.background_command( +# """{command} {dir}/log/compute_prob_train.{iter}.log \ +# nnet3-chain-compute-prob-post --l2-regularize={l2} {objective_opts} \ +# --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ +# "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ +# "ark,bg:nnet3-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/train_diagnostic{egs_suffix} \ +# ark:- | nnet3-merge-egs --sort-by-t --minibatch-size=1:64 ark:- ark:- |" \ +# """.format(command=run_opts.command, dir=dir, iter=iter, model=model, +# l2=l2_regularize, leaky=leaky_hmm_coefficient, +# xent_reg=xent_regularize, +# egs_dir=egs_dir, +# multitask_egs_opts=multitask_egs_opts, +# scp_or_ark=scp_or_ark, egs_suffix=egs_suffix, +# objective_opts=objective_opts)) +# +# +#def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, +# egs_dir, leaky_hmm_coefficient, l2_regularize, +# xent_regularize, run_opts, +# max_objective_evaluations=30, +# use_multitask_egs=False, +# objective_opts=""): +# """ Function to do model combination +# +# In the nnet3 setup, the logic +# for doing averaging of subsets of the models in the case where +# there are too many models to reliably esetimate interpolation +# factors (max_models_combine) is moved into the nnet3-combine. +# """ +# raw_model_strings = [] +# logger.info("Combining {0} models.".format(models_to_combine)) +# +# models_to_combine.add(num_iters) +# +# for iter in sorted(models_to_combine): +# model_file = '{0}/{1}.mdl'.format(dir, iter) +# if os.path.exists(model_file): +# # we used to copy them with nnet3-am-copy --raw=true, but now +# # the raw-model-reading code discards the other stuff itself. 
+# raw_model_strings.append(model_file) +# else: +# print("{0}: warning: model file {1} does not exist " +# "(final combination)".format(sys.argv[0], model_file)) +# +# scp_or_ark = "scp" if use_multitask_egs else "ark" +# egs_suffix = ".scp" if use_multitask_egs else ".egs" +# +# multitask_egs_opts = common_train_lib.get_multitask_egs_opts( +# egs_dir, +# egs_prefix="combine.", +# use_multitask_egs=use_multitask_egs) +# +# # We reverse the order of the raw model strings so that the freshest one +# # goes first. This is important for systems that include batch +# # normalization-- it means that the freshest batch-norm stats are used. +# # Since the batch-norm stats are not technically parameters, they are not +# # combined in the combination code, they are just obtained from the first +# # model. +# raw_model_strings = list(reversed(raw_model_strings)) +# +# common_lib.execute_command( +# """{command} {combine_queue_opt} {dir}/log/combine.log \ +# nnet3-chain-combine-post {objective_opts} \ +# --max-objective-evaluations={max_objective_evaluations} \ +# --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ +# --verbose=3 {dir}/den.fst {raw_models} \ +# "ark,bg:nnet3-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/combine{egs_suffix} ark:- | \ +# nnet3-merge-egs --sort-by-t --minibatch-size={num_chunk_per_mb} \ +# ark:- ark:- |" - \| \ +# nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl \ +# {dir}/final.mdl""".format( +# command=run_opts.command, +# combine_queue_opt=run_opts.combine_queue_opt, +# max_objective_evaluations=max_objective_evaluations, +# l2=l2_regularize, leaky=leaky_hmm_coefficient, +# dir=dir, raw_models=" ".join(raw_model_strings), +# num_chunk_per_mb=num_chunk_per_minibatch_str, +# num_iters=num_iters, +# egs_dir=egs_dir, +# multitask_egs_opts=multitask_egs_opts, +# scp_or_ark=scp_or_ark, egs_suffix=egs_suffix, +# objective_opts=objective_opts)) +# +# # Compute the probability of the final, combined model with +# # the same subset we used for the previous compute_probs, as the +# # different subsets will lead to different probs. +# compute_train_cv_probabilities( +# dir=dir, iter='final', egs_dir=egs_dir, +# l2_regularize=l2_regularize, xent_regularize=xent_regularize, +# leaky_hmm_coefficient=leaky_hmm_coefficient, +# run_opts=run_opts, +# use_multitask_egs=use_multitask_egs, +# objective_opts=objective_opts) + diff --git a/egs/wsj/s5/steps/nnet3/align_lats.sh b/egs/wsj/s5/steps/nnet3/align_lats.sh index 27147909101..9f05379d55f 100755 --- a/egs/wsj/s5/steps/nnet3/align_lats.sh +++ b/egs/wsj/s5/steps/nnet3/align_lats.sh @@ -51,9 +51,9 @@ dir=$4 oov=`cat $lang/oov.int` || exit 1; mkdir -p $dir/log echo $nj > $dir/num_jobs -sdata=$data/split${nj}utt +sdata=$data/split${nj} [[ -d $sdata && $data/feats.scp -ot $sdata ]] || \ - split_data.sh --per-utt $data $nj || exit 1; + split_data.sh $data $nj || exit 1; extra_files= if [ ! -z "$online_ivector_dir" ]; then diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 1341d8f6b77..b56e94078d1 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -306,8 +306,6 @@ chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$ali [ ! -z $left_tolerance ] && \ chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance" -normalization_scale=1.0 - lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" if [ ! 
-z $lattice_prune_beam ]; then if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then @@ -317,6 +315,8 @@ if [ ! -z $lattice_prune_beam ]; then fi fi +normalization_scale=1.0 + if [ ! -z "$lattice_lm_scale" ]; then chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" @@ -353,8 +353,8 @@ if [ $stage -le 2 ]; then ( $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \ lattice-copy --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \ - "$lats_rspecifier" \ - ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1 + "$lats_rspecifier" \ + ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1 for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp @@ -432,7 +432,6 @@ if [ $stage -le 4 ]; then # there can be too many small files to deal with, because the total number of # files is the product of 'nj' by 'num_archives_intermediate', which might be # quite large. - $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \ lattice-align-phones --replace-output-symbols=true $latdir/final.mdl \ "$lats_rspecifier" ark:- \| \ @@ -522,7 +521,7 @@ if [ $stage -le 6 ]; then # 'storage' directory. rm cegs_orig.*.ark 2>/dev/null ) - if [ $archives_multiple -gt 1 ]; then + if ! $generate_egs_scp && [ $archives_multiple -gt 1 ]; then # there are some extra soft links that we should delete. for f in $dir/cegs.*.*.ark; do rm $f; done fi diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh index e0fd6b5c01a..d9db66c3153 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh @@ -335,9 +335,8 @@ if [ $stage -le 2 ]; then ( $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \ lattice-copy --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \ - --write-compact=false \ - "$lats_rspecifier" \ - ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1 + --write-compact=false "$lats_rspecifier" \ + ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1 for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp @@ -503,7 +502,7 @@ if [ $stage -le 6 ]; then # 'storage' directory. rm cegs_orig.*.ark 2>/dev/null ) - if [ $archives_multiple -gt 1 ]; then + if ! $generate_egs_scp && [ $archives_multiple -gt 1 ]; then # there are some extra soft links that we should delete. for f in $dir/cegs.*.*.ark; do rm $f; done fi diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_ts.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_ts.sh new file mode 100755 index 00000000000..01656d85070 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_ts.sh @@ -0,0 +1,492 @@ +#!/bin/bash + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script, which will generally be called from other neural-net training +# scripts, extracts the training examples used to train the 'chain' system +# (and also the validation examples used for diagnostics), and puts them in +# separate archives. +# +# This script dumps egs with many frames of labels, controlled by the +# frames_per_eg config variable (default: 25), plus left and right context. +# Because CTC training involves alignment of data, we can't meaningfully train +# frame by frame. 
The supervision approach involves the time alignment, though-- +# it is just applied in a loose way, where each symbol can appear in the +# frame-range that it was in in the alignment, extended by a certain margin. +# + + +# Begin configuration section. +cmd=run.pl +frames_per_eg=25 # number of feature frames example (not counting added context). + # more->less disk space and less time preparing egs, but more + # I/O during training. +frames_overlap_per_eg=0 # number of supervised frames of overlap that we aim for per eg. + # can be useful to avoid wasted data if you're using --left-deriv-truncate + # and --right-deriv-truncate. +frame_subsampling_factor=3 # frames-per-second of features we train on divided + # by frames-per-second at output of chain model +left_context=4 # amount of left-context per eg (i.e. extra frames of input features + # not present in the output supervision). +right_context=4 # amount of right-context per eg. +left_context_initial=-1 # if >=0, left-context for first chunk of an utterance +right_context_final=-1 # if >=0, right-context for last chunk of an utterance +compress=true # set this to false to disable compression (e.g. if you want to see whether + # results are affected). + +num_utts_subset=300 # number of utterances in validation and training + # subsets used for shrinkage and diagnostics. +num_valid_egs_combine=0 # #validation examples for combination weights at the very end. +num_train_egs_combine=1000 # number of train examples for the above. +num_egs_diagnostic=400 # number of frames for "compute_prob" jobs +frames_per_iter=400000 # each iteration of training, see this many frames per + # job, measured at the sampling rate of the features + # used. This is just a guideline; it will pick a number + # that divides the number of samples in the entire data. + +transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms + +stage=0 +max_jobs_run=15 # This should be set to the maximum number of jobs you are + # comfortable to run in parallel; you can increase it if your disk + # speed is greater and you have more machines. +max_shuffle_jobs_run=50 +srand=0 # rand seed for nnet3-chain-get-egs-post, nnet3-chain-copy-egs and nnet3-chain-shuffle-egs +online_ivector_dir= # can be used if we are including speaker information as iVectors. +cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, + # it doesn't make sense to use different options than were used as input to the + # LDA transform). This is used to turn off CMVN in the online-nnet experiments. +lattice_lm_scale= # If supplied, the graph/lm weight of the lattices will be + # used (with this scale) in generating supervisions +egs_weight=1.0 # The weight which determines how much each training example + # contributes to gradients while training (can be used + # to down/up-weight a dataset) +lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, + # before being used to get supervisions. +acwt=0.1 # For pruning +phone_insertion_penalty= +deriv_weights_scp= +generate_egs_scp=false +no_chunking=false + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; + + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train exp/tri4_nnet exp/tri3_lats exp/tri4_nnet/egs" + echo "" + echo "From , 0.trans_mdl (the transition-model), tree (the tree)" + echo "and normalization.fst (the normalization FST, derived from the denominator FST)" + echo "are read." + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --max-jobs-run # The maximum number of jobs you want to run in" + echo " # parallel (increase this only if you have good disk and" + echo " # network speed). default=6" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --frames-per-iter <#samples;400000> # Number of frames of data to process per iteration, per" + echo " # process." + echo " --frame-subsampling-factor # factor by which num-frames at nnet output is reduced " + echo " --frames-per-eg # number of supervised frames per eg on disk" + echo " --frames-overlap-per-eg # number of supervised frames of overlap between egs" + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --left-context-initial # If >= 0, left-context for first chunk of an utterance" + echo " --right-context-final # If >= 0, right-context for last chunk of an utterance" + echo " --num-egs-diagnostic <#frames;4000> # Number of egs used in computing (train,valid) diagnostics" + echo " --num-valid-egs-combine <#frames;10000> # Number of egs used in getting combination weights at the" + echo " # very end." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + exit 1; +fi + +data=$1 +chaindir=$2 +latdir=$3 +dir=$4 + +# Check some files. +[ ! -z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" + +$no_chunking && extra_files="$extra_files $data/allowed_lengths.txt" + +for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ + $chaindir/{0.trans_mdl,tree,normalization.fst} $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +nj=$(cat $latdir/num_jobs) || exit 1 + +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log $dir/info + +# Get list of validation utterances. + +frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 +utils/data/get_utt2dur.sh $data + +if $no_chunking; then + frames_per_eg=$(cat $data/allowed_lengths.txt | tr '\n' , | sed 's/,$//') + + cut -d ' ' -f 1 $data/utt2spk | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; +else + cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; +fi + +len_uttlist=`wc -l $dir/valid_uttlist | awk '{print $1}'` +if [ $len_uttlist -lt $num_utts_subset ]; then + echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; +fi + +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. + # because of this stage we can again have utts with lengths less than + # frames_per_eg + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." 
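+  # map each selected utterance to its 'uniq' id and then back to every
+  # utterance sharing that id, so all perturbed copies of a selected
+  # utterance end up in the expanded list.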
+ mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp +fi + +if $no_chunking; then + cut -d ' ' -f 1 $data/utt2spk | \ + utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; +else + cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; +fi + +len_uttlist=`wc -l $dir/train_subset_uttlist | awk '{print $1}'` +if [ $len_uttlist -lt $num_utts_subset ]; then + echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; +fi + +[ -z "$transform_dir" ] && transform_dir=$latdir + +# because we'll need the features with a different number of jobs than $latdir, +# copy to ark,scp. +if [ -f $transform_dir/raw_trans.1 ]; then + echo "$0: using raw transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi + +## Set up features. +echo "$0: feature type is raw" +feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" +valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" +train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" +echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. + +if [ -f $dir/trans.scp ]; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" +fi + +tree-info $chaindir/tree | grep num-pdfs | awk '{print $2}' > $dir/info/num_pdfs || exit 1 + +if [ ! -z "$online_ivector_dir" ]; then + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + echo $ivector_dim > $dir/info/ivector_dim + steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/info/final.ie.id || exit 1 + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +else + ivector_opts="" + echo 0 >$dir/info/ivector_dim +fi + +if [ $stage -le 1 ]; then + echo "$0: working out number of frames of training data" + num_frames=$(steps/nnet2/get_num_frames.sh $data) + echo $num_frames > $dir/info/num_frames + echo "$0: working out feature dim" + feats_one="$(echo $feats | sed s/JOB/1/g)" + if ! 
feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null); then + echo "Command failed (getting feature dim): feat-to-dim \"$feats_one\"" + exit 1 + fi + echo $feat_dim > $dir/info/feat_dim +else + num_frames=$(cat $dir/info/num_frames) || exit 1; + feat_dim=$(cat $dir/info/feat_dim) || exit 1; +fi + +# the + 1 is to round up, not down... we assume it doesn't divide exactly. +num_archives=$[$num_frames/$frames_per_iter+1] + +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). +# This sometimes gives a misleading answer as GridEngine sometimes changes the +# limit, so we limit it to 512. +max_open_filehandles=$(ulimit -n) || exit 1 +[ $max_open_filehandles -gt 512 ] && max_open_filehandles=512 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple] || exit 1; +done +# now make sure num_archives is an exact multiple of archives_multiple. +num_archives=$[$archives_multiple*$num_archives_intermediate] || exit 1; + +echo $num_archives >$dir/info/num_archives +echo $frames_per_eg >$dir/info/frames_per_eg +# Work out the number of egs per archive +egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] || exit 1; +! [ $egs_per_archive -le $frames_per_iter ] && \ + echo "$0: script error: egs_per_archive=$egs_per_archive not <= frames_per_iter=$frames_per_iter" \ + && exit 1; + +echo $egs_per_archive > $dir/info/egs_per_archive + +echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" +echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" +if [ $left_context_initial -ge 0 ] || [ $right_context_final -ge 0 ]; then + echo "$0: ... and (left-context-initial,right-context-final) = ($left_context_initial,$right_context_final)" +fi + + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. + echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/cegs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/cegs_orig.$y.$x.ark; done) + done +fi + +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" +[ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" +[ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" +$no_chunking && egs_opts="$egs_opts --no-chunking" + +[ ! -z "$deriv_weights_scp" ] && egs_opts="$egs_opts --deriv-weights-rspecifier=scp:$deriv_weights_scp" + +chain_supervision_all_opts="--acoustic-scale=$acwt" + +normalization_scale=1.0 + +lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" +if [ ! -z $lattice_prune_beam ]; then + if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then + lats_rspecifier="$lats_rspecifier lattice-1best --acoustic-scale=$acwt ark:- ark:- |" + else + lats_rspecifier="$lats_rspecifier lattice-prune --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:- |" + fi +fi + +if [ ! 
-z "$lattice_lm_scale" ]; then + chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" +fi + +[ ! -z $phone_insertion_penalty ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --phone-ins-penalty=$phone_insertion_penalty" + +echo $left_context > $dir/info/left_context +echo $right_context > $dir/info/right_context +echo $left_context_initial > $dir/info/left_context_initial +echo $right_context_final > $dir/info/right_context_final + +if true || [ $stage -le 2 ]; then + echo "$0: Getting validation and training subset examples in background." + rm $dir/.error 2>/dev/null + + ( + $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/lattice_copy.JOB.log \ + lattice-copy --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \ + "$lats_rspecifier" \ + ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1 + + for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp + + $cmd $dir/log/create_valid_subset.log \ + utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ + nnet3-chain-get-egs-post $chain_supervision_all_opts $ivector_opts --srand=$srand \ + $egs_opts $chaindir/normalization.fst \ + $chaindir/0.trans_mdl "$valid_feats" scp:- "ark:$dir/valid_all.cegs" || exit 1 & + $cmd $dir/log/create_train_subset.log \ + utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ + nnet3-chain-get-egs-post $chain_supervision_all_opts $ivector_opts --srand=$srand \ + $egs_opts $chaindir/normalization.fst \ + $chaindir/0.trans_mdl "$train_subset_feats" scp:- \ + "ark:$dir/train_subset_all.cegs" || exit 1 & + wait + sleep 5 # wait for file system to sync. + echo "... Getting subsets of validation examples for diagnostics and combination." + if $generate_egs_scp; then + valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.cegs,$dir/valid_diagnostic.scp" + train_diagnostic_output="ark,scp:$dir/train_diagnostic.cegs,$dir/train_diagnostic.scp" + else + valid_diagnostic_output="ark:$dir/valid_diagnostic.cegs" + train_diagnostic_output="ark:$dir/train_diagnostic.cegs" + fi + $cmd $dir/log/create_valid_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \ + ark:$dir/valid_combine.cegs || exit 1 & + $cmd $dir/log/create_valid_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \ + $valid_diagnostic_output || exit 1 & + + $cmd $dir/log/create_train_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \ + ark:$dir/train_combine.cegs || exit 1 & + $cmd $dir/log/create_train_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ + $train_diagnostic_output || exit 1 & + wait + sleep 5 # wait for file system to sync. + if $generate_egs_scp; then + cat $dir/valid_combine.cegs $dir/train_combine.cegs | \ + nnet3-chain-copy-egs ark:- ark,scp:$dir/combine.cegs,$dir/combine.scp + rm $dir/{train,valid}_combine.scp + else + cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs + fi + + for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done + rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs + ) || touch $dir/.error & +fi + +if [ $stage -le 4 ]; then + # create cegs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. 
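  # For example, with nj=40 and num_archives_intermediate=10, this stage writes
  # cegs_orig.1.1.ark ... cegs_orig.40.10.ark (nj * num_archives_intermediate
  # files in total), which stage 5 below merges and shuffles into the final
  # cegs.*.ark archives.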
+ + egs_list= + for n in $(seq $num_archives_intermediate); do + egs_list="$egs_list ark:$dir/cegs_orig.JOB.$n.ark" + done + echo "$0: Generating training examples on disk" + + # The examples will go round-robin to egs_list. Note: we omit the + # 'normalization.fst' argument while creating temporary egs: the phase of egs + # preparation that involves the normalization FST is quite CPU-intensive and + # it's more convenient to do it later, in the 'shuffle' stage. Otherwise to + # make it efficient we need to use a large 'nj', like 40, and in that case + # there can be too many small files to deal with, because the total number of + # files is the product of 'nj' by 'num_archives_intermediate', which might be + # quite large. + + $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \ + nnet3-chain-get-egs-post $chain_supervision_all_opts $ivector_opts --srand=\$[JOB+$srand] \ + $egs_opts --num-frames-overlap=$frames_overlap_per_eg \ + $chaindir/normalization.fst \ + $chaindir/0.trans_mdl "$feats" \ + "$lats_rspecifier" ark:- \| \ + nnet3-chain-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; +fi + +if [ $stage -le 5 ]; then + echo "$0: recombining and shuffling order of archives on disk" + # combine all the "cegs_orig.*.JOB.scp" (over the $nj splits of the data) and + # shuffle the order, writing to the cegs.JOB.ark + + # the input is a concatenation over the input jobs. + egs_list= + for n in $(seq $nj); do + egs_list="$egs_list $dir/cegs_orig.$n.JOB.ark" + done + + if [ $archives_multiple == 1 ]; then # normal case. + if $generate_egs_scp; then + output_archive="ark,scp:$dir/cegs.JOB.ark,$dir/cegs.JOB.scp" + else + output_archive="ark:$dir/cegs.JOB.ark" + fi + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" $output_archive || exit 1; + + if $generate_egs_scp; then + #concatenate cegs.JOB.scp in single cegs.scp + rm -rf $dir/cegs.scp + for j in $(seq $num_archives_intermediate); do + cat $dir/cegs.$j.scp || exit 1; + done > $dir/cegs.scp || exit 1; + for f in $dir/cegs.*.scp; do rm $f; done + fi + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). 
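    # A worked example of the index arithmetic below: with archives_multiple=3,
    # intermediate archive x=2 is split into final archives (2-1)*3+{1,2,3} =
    # {4,5,6}, and cegs.2.{1,2,3}.ark are created as soft links pointing at
    # cegs.4.ark, cegs.5.ark and cegs.6.ark respectively.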
+ if $generate_egs_scp; then + output_archives="$(for y in $(seq $archives_multiple); do echo ark,scp:$dir/cegs.JOB.$y.ark,$dir/cegs.JOB.$y.scp; done)" + else + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)" + fi + for x in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + archive_index=$[($x-1)*$archives_multiple+$y] + # cegs.intermediate_archive.{1,2,...}.ark will point to cegs.archive.ark + ln -sf cegs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1 + done + done + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-copy-egs ark:- $output_archives || exit 1; + + if $generate_egs_scp; then + #concatenate cegs.JOB.scp in single cegs.scp + rm -rf $dir/cegs.scp + for j in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + cat $dir/cegs.$j.$y.scp || exit 1; + done + done > $dir/cegs.scp || exit 1; + for f in $dir/cegs.*.*.scp; do rm $f; done + fi + fi +fi + +wait +[ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + +if [ $stage -le 6 ]; then + echo "$0: removing temporary archives" + ( + cd $dir + for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done + # the next statement removes them if we weren't using the soft links to a + # 'storage' directory. + rm cegs_orig.*.ark 2>/dev/null + ) + if ! $generate_egs_scp && [ $archives_multiple -gt 1 ]; then + # there are some extra soft links that we should delete. + for f in $dir/cegs.*.*.ark; do rm $f; done + fi + # rm $dir/lat_special.*.ark + echo "$0: removing temporary alignments and transforms" + # Ignore errors below because trans.* might not exist. + rm $dir/{ali,trans}.{ark,scp} 2>/dev/null + +fi + +echo "$0: Finished preparing training examples" diff --git a/egs/wsj/s5/steps/nnet3/chain/train_ts.py b/egs/wsj/s5/steps/nnet3/chain/train_ts.py new file mode 100755 index 00000000000..ec86b873195 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/train_ts.py @@ -0,0 +1,761 @@ +#!/usr/bin/env python + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. + +""" This script is based on steps/nnet3/chain/train.sh +""" + +import argparse +import logging +import os +import pprint +import shutil +import sys +import traceback + +sys.path.insert(0, 'steps') +import libs.nnet3.train.common as common_train_lib +import libs.common as common_lib +import libs.nnet3.train.chain_objf.acoustic_model as chain_lib +import libs.nnet3.train.chain_objf.ts as ts_lib +import libs.nnet3.report.log_parse as nnet3_log_parse + + +logger = logging.getLogger('libs') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting chain model trainer (train.py)') + + +def get_args(): + """ Get args from stdin. + + We add compulsary arguments as named arguments for readability + + The common options are defined in the object + libs.nnet3.train.common.CommonParser.parser. 
+ See steps/libs/nnet3/train/common.py + """ + + parser = argparse.ArgumentParser( + description="""Trains RNN and DNN acoustic models using the 'chain' + objective function.""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler='resolve', + parents=[common_train_lib.CommonParser().parser]) + + # egs extraction options + parser.add_argument("--egs.chunk-width", type=str, dest='chunk_width', + default="20", + help="""Number of frames per chunk in the examples + used to train the RNN. Caution: if you double this you + should halve --trainer.samples-per-iter. May be + a comma-separated list of alternatives: first width + is the 'principal' chunk-width, used preferentially""") + + # chain options + parser.add_argument("--chain.lm-opts", type=str, dest='lm_opts', + default=None, action=common_lib.NullstrToNoneAction, + help="options to be be passed to chain-est-phone-lm") + parser.add_argument("--chain.l2-regularize", type=float, + dest='l2_regularize', default=0.0, + help="""Weight of regularization function which is the + l2-norm of the output of the network. It should be used + without the log-softmax layer for the outputs. As + l2-norm of the log-softmax outputs can dominate the + objective function.""") + parser.add_argument("--chain.xent-regularize", type=float, + dest='xent_regularize', default=0.0, + help="Weight of regularization function which is the " + "cross-entropy cost the outputs.") + parser.add_argument("--chain.norm-regularize", type=str, + dest='norm_regularize', default=False, + action=common_lib.StrToBoolAction, + choices=["true", "false"], + help="""If true, instead of l2-regularization on + output of the network, we use l1-regularization on + exp(output) of the network. This tends to make + exp(output) more like probabilities.""") + parser.add_argument("--chain.leaky-hmm-coefficient", type=float, + dest='leaky_hmm_coefficient', default=0.00001, + help="") + parser.add_argument("--chain.smbr-leaky-hmm-coefficient", type=float, + dest='smbr_leaky_hmm_coefficient', default=0.00001, + help="") + parser.add_argument("--chain.apply-deriv-weights", type=str, + dest='apply_deriv_weights', default=True, + action=common_lib.StrToBoolAction, + choices=["true", "false"], + help="") + parser.add_argument("--chain.truncate-deriv-weights", type=int, + dest='truncate_deriv_weights', default=0, + help="""Can be used to set to zero the weights of + derivs from frames near the edges. (counts subsampled + frames)""") + parser.add_argument("--chain.frame-subsampling-factor", type=int, + dest='frame_subsampling_factor', default=3, + help="ratio of frames-per-second of features we " + "train on, to chain model's output") + parser.add_argument("--chain.left-deriv-truncate", type=int, + dest='left_deriv_truncate', + default=None, + help="Deprecated. 
Kept for back compatibility") + parser.add_argument("--chain.smbr-extra-opts", type=str, + dest='smbr_extra_opts', default=None, + action=common_lib.NullstrToNoneAction, + help="Some additional options related to sMBR") + parser.add_argument("--chain.smbr-factor-schedule", type=str, + dest='smbr_factor_schedule', default=None, + action=common_lib.NullstrToNoneAction, + help="Schedule for sMBR factor in LF-SMBR training.") + parser.add_argument("--chain.mmi-factor-schedule", type=str, + dest='mmi_factor_schedule', default=None, + action=common_lib.NullstrToNoneAction, + help="Schedule for MMI factor in LF-SMBR training.") + parser.add_argument("--chain.smbr-xent-regularize", default=None, + dest='smbr_xent_regularize', type=float, + help="Xent regularizer term used with sMBR training") + parser.add_argument("--chain.smbr-l2-regularize", default=None, + dest='smbr_l2_regularize', type=float, + help="L2 regularizer term used with sMBR training") + + # trainer options + parser.add_argument("--trainer.input-model", type=str, + dest='input_model', default=None, + action=common_lib.NullstrToNoneAction, + help="If specified, this model is used as initial " + "'raw' model (0.raw in the script) instead of " + "initializing the model from the xconfig. " + "Also configs dir is not expected to exist " + "and left/right context is computed from this " + "model.") + parser.add_argument("--trainer.num-epochs", type=float, dest='num_epochs', + default=10.0, + help="Number of epochs to train the model") + parser.add_argument("--trainer.frames-per-iter", type=int, + dest='frames_per_iter', default=800000, + help="""Each iteration of training, see this many + [input] frames per job. This option is passed to + get_egs.sh. Aim for about a minute of training + time""") + + parser.add_argument("--trainer.num-chunk-per-minibatch", type=str, + dest='num_chunk_per_minibatch', default='128', + help="""Number of sequences to be processed in + parallel every minibatch. May be a more general + rule as accepted by the --minibatch-size option of + nnet3-merge-egs; run that program without args to see + the format.""") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", + type=float, dest='initial_effective_lrate', + default=0.0002, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", + type=float, dest='final_effective_lrate', + default=0.00002, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.shrink-value", type=float, + dest='shrink_value', default=1.0, + help="""Scaling factor used for scaling the parameter + matrices when the derivative averages are below the + shrink-threshold at the non-linearities. E.g. 0.99. + Only applicable when the neural net contains sigmoid or + tanh units.""") + parser.add_argument("--trainer.optimization.shrink-saturation-threshold", + type=float, + dest='shrink_saturation_threshold', default=0.40, + help="""Threshold that controls when we apply the + 'shrinkage' (i.e. scaling by shrink-value). If the + saturation of the sigmoid and tanh nonlinearities in + the neural net (as measured by + steps/nnet3/get_saturation.pl) exceeds this threshold + we scale the parameter matrices with the + shrink-value.""") + # RNN-specific training options + parser.add_argument("--trainer.deriv-truncate-margin", type=int, + dest='deriv_truncate_margin', default=None, + help="""(Relevant only for recurrent models). 
If + specified, gives the margin (in input frames) around + the 'required' part of each chunk that the derivatives + are backpropagated to. If unset, the derivatives are + backpropagated all the way to the boundaries of the + input data. E.g. 8 is a reasonable setting. Note: the + 'required' part of the chunk is defined by the model's + {left,right}-context.""") + + parser.add_argument("--lang", type=str, + help="Lang directory to get silence pdfs.") + + # General options + parser.add_argument("--feat-dir", type=str, required=True, + help="Directory with features used for training " + "the neural network.") + parser.add_argument("--tree-dir", type=str, required=True, + help="""Directory containing the tree to use for this + model (we also expect final.mdl and ali.*.gz in that + directory""") + parser.add_argument("--lat-dir", type=str, required=True, + help="Directory with numerator lattices " + "used for training the neural network.") + parser.add_argument("--dir", type=str, required=True, + help="Directory to store the models and " + "all other files.") + + print(' '.join(sys.argv)) + print(sys.argv) + + args = parser.parse_args() + + [args, run_opts] = process_args(args) + + return [args, run_opts] + + +def process_args(args): + """ Process the options got from get_args() + """ + + if not common_train_lib.validate_chunk_width(args.chunk_width): + raise Exception("--egs.chunk-width has an invalid value") + + if not common_train_lib.validate_minibatch_size_str(args.num_chunk_per_minibatch): + raise Exception("--trainer.num-chunk-per-minibatch has an invalid value") + + if args.chunk_left_context < 0: + raise Exception("--egs.chunk-left-context should be non-negative") + + if args.chunk_right_context < 0: + raise Exception("--egs.chunk-right-context should be non-negative") + + if args.left_deriv_truncate is not None: + args.deriv_truncate_margin = -args.left_deriv_truncate + logger.warning( + "--chain.left-deriv-truncate (deprecated) is set by user, and " + "--trainer.deriv-truncate-margin is set to negative of that " + "value={0}. We recommend using the option " + "--trainer.deriv-truncate-margin.".format( + args.deriv_truncate_margin)) + + if (not os.path.exists(args.dir) + or (not os.path.exists(args.dir+"/configs") and + (args.input_model is None or not os.path.exists(args.input_model)))): + raise Exception("This script expects {0} to exist. Also either " + "--trainer.input-model option as initial 'raw' model " + "(used as 0.raw in the script) should be supplied or " + "{0}/configs directory which is the output of " + "make_configs.py script should be provided." + "".format(args.dir)) + + if args.transform_dir is None: + args.transform_dir = args.lat_dir + # set the options corresponding to args.use_gpu + run_opts = common_train_lib.RunOpts() + if args.use_gpu: + if not common_lib.check_if_cuda_compiled(): + logger.warning( + """You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. + If you have GPUs and have nvcc installed, go to src/ and do + ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + + else: + logger.warning("Without using a GPU this will be very slow. 
" + "nnet3 does not yet support multiple threads.") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + + run_opts.command = args.command + run_opts.egs_command = (args.egs_command + if args.egs_command is not None else + args.command) + + return [args, run_opts] + + +def get_silence_pdfs(args): + if args.lang is None: + return "" + + out = common_lib.get_command_stdout( + "am-info {0}/0.trans_mdl | grep transition-ids".format(args.dir)) + num_tids = int(out.split()[-1]) + + out = common_lib.get_command_stdout( + "seq -s ' ' 0 {num_tids} | ali-to-pdf " + "{dir}/0.trans_mdl ark,t:- ark,t:-" + "".format(num_tids=num_tids-1, dir=args.dir)) + pdfs = [int(x) for x in out.split()[1:]] + + out = common_lib.get_command_stdout( + "seq -s ' ' 0 {num_tids} | ali-to-phones --per-frame " + "{dir}/0.trans_mdl ark,t:- ark,t:-" + "".format(num_tids=num_tids-1, dir=args.dir)) + phones = [int(x) for x in out.split()[1:]] + + silence_phones_list = open( + "{lang}/phones/silence.int" + "".format(lang=args.lang)).readline() + silence_phones = set([int(x) for x in silence_phones_list.split(":")]) + + silence_pdfs = list(set([str(pdfs[i]) for i, ph in enumerate(phones) + if ph in silence_phones])) + return ",".join(sorted(silence_pdfs)) + + +def train(args, run_opts): + """ The main function for training. + + Args: + args: a Namespace object with the required parameters + obtained from the function process_args() + run_opts: RunOpts object obtained from the process_args() + """ + + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + # Check files + chain_lib.check_for_required_files(args.feat_dir, args.tree_dir, + args.lat_dir if args.egs_dir is None + else None) + + # Set some variables. + num_jobs = common_lib.get_number_of_jobs(args.tree_dir) + feat_dim = common_lib.get_feat_dim(args.feat_dir) + ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) + ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir) + + # split the training data into parts for individual jobs + # we will use the same number of jobs as that used for alignment + common_lib.execute_command("utils/split_data.sh {0} {1}" + "".format(args.feat_dir, num_jobs)) + with open('{0}/num_jobs'.format(args.dir), 'w') as f: + f.write(str(num_jobs)) + + if args.input_model is None: + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + variables = common_train_lib.parse_generic_config_vars_file(var_file) + else: + # If args.input_model is specified, the model left and right contexts + # are computed using input_model. + variables = common_train_lib.get_input_model_info(args.input_model) + + # Set some variables. + try: + model_left_context = variables['model_left_context'] + model_right_context = variables['model_right_context'] + except KeyError as e: + raise Exception("KeyError {0}: Variables need to be defined in " + "{1}".format(str(e), '{0}/configs'.format(args.dir))) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + left_context_initial = (args.chunk_left_context_initial + model_left_context if + args.chunk_left_context_initial >= 0 else -1) + right_context_final = (args.chunk_right_context_final + model_right_context if + args.chunk_right_context_final >= 0 else -1) + + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. 
This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + if (args.stage <= -6): + logger.info("Creating phone language-model") + chain_lib.create_phone_lm(args.dir, args.tree_dir, run_opts, + lm_opts=args.lm_opts) + + if (args.stage <= -5): + logger.info("Creating denominator FST") + shutil.copy('{0}/tree'.format(args.tree_dir), args.dir) + chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts) + + if ((args.stage <= -4) and + os.path.exists("{0}/configs/init.config".format(args.dir)) + and (args.input_model is None)): + logger.info("Initializing a basic network for estimating " + "preconditioning matrix") + common_lib.execute_command( + """{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config \ + {dir}/init.raw""".format(command=run_opts.command, + dir=args.dir)) + + egs_left_context = left_context + args.frame_subsampling_factor / 2 + egs_right_context = right_context + args.frame_subsampling_factor / 2 + # note: the '+ args.frame_subsampling_factor / 2' is to allow for the + # fact that we'll be shifting the data slightly during training to give + # variety to the training data. + egs_left_context_initial = (left_context_initial + + args.frame_subsampling_factor / 2 if + left_context_initial >= 0 else -1) + egs_right_context_final = (right_context_final + + args.frame_subsampling_factor / 2 if + right_context_final >= 0 else -1) + + default_egs_dir = '{0}/egs'.format(args.dir) + if ((args.stage <= -3) and args.egs_dir is None): + logger.info("Generating egs using {0}".format("steps/nnet3/chain/get_egs_ts.sh")) + if (not os.path.exists("{0}/den.fst".format(args.dir)) or + not os.path.exists("{0}/normalization.fst".format(args.dir)) or + not os.path.exists("{0}/tree".format(args.dir))): + raise Exception("Chain egs generation expects {0}/den.fst, " + "{0}/normalization.fst and {0}/tree " + "to exist.".format(args.dir)) + # this is where get_egs.sh is called. 
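        # ts_lib.generate_chain_egs is assumed to be a thin wrapper around
        # steps/nnet3/chain/get_egs_ts.sh (logged above); it receives the
        # egs-level contexts computed above, which already include the extra
        # frame_subsampling_factor/2 margin used when frame-shifting the data.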
+ ts_lib.generate_chain_egs( + dir=args.dir, data=args.feat_dir, + lat_dir=args.lat_dir, egs_dir=default_egs_dir, + left_context=egs_left_context, + right_context=egs_right_context, + left_context_initial=egs_left_context_initial, + right_context_final=egs_right_context_final, + run_opts=run_opts, + frame_subsampling_factor=args.frame_subsampling_factor, + frames_per_eg_str=args.chunk_width, + srand=args.srand, + egs_opts=args.egs_opts, + cmvn_opts=args.cmvn_opts, + online_ivector_dir=args.online_ivector_dir, + frames_per_iter=args.frames_per_iter, + transform_dir=args.transform_dir, + stage=args.egs_stage) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, + frames_per_eg_str, num_archives] = ( + common_train_lib.verify_egs_dir(egs_dir, feat_dim, + ivector_dim, ivector_id, + egs_left_context, egs_right_context, + egs_left_context_initial, + egs_right_context_final)) + assert(args.chunk_width == frames_per_eg_str) + num_archives_expanded = num_archives * args.frame_subsampling_factor + + if (args.num_jobs_final > num_archives_expanded): + raise Exception('num_jobs_final cannot exceed the ' + 'expanded number of archives') + + # copy the properties of the egs to dir for + # use during decoding + logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir)) + common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) + + if not os.path.exists('{0}/valid_diagnostic.egs'.format(egs_dir)): + if (not os.path.exists('{0}/valid_diagnostic.scp'.format(egs_dir))): + raise Exception('neither {0}/valid_diagnostic.egs nor ' + '{0}/valid_diagnostic.scp exist.' + 'This script expects one of them.'.format(egs_dir)) + use_multitask_egs = True + else: + use_multitask_egs = False + + if ((args.stage <= -2) and (os.path.exists(args.dir+"/configs/init.config")) + and (args.input_model is None)): + logger.info('Computing the preconditioning matrix for input features') + + chain_lib.compute_preconditioning_matrix( + args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs=args.max_lda_jobs, + rand_prune=args.rand_prune, + use_multitask_egs=use_multitask_egs) + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + chain_lib.prepare_initial_acoustic_model(args.dir, run_opts, + input_model=args.input_model) + + with open("{0}/frame_subsampling_factor".format(args.dir), "w") as f: + f.write(str(args.frame_subsampling_factor)) + + # set num_iters so that as close as possible, we process the data + # $num_epochs times, i.e. $num_iters*$avg_num_jobs) == + # $num_epochs*$num_archives, where + # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. + num_archives_to_process = int(args.num_epochs * num_archives_expanded) + num_archives_processed = 0 + num_iters = ((num_archives_to_process * 2) + / (args.num_jobs_initial + args.num_jobs_final)) + + # If do_final_combination is True, compute the set of models_to_combine. + # Otherwise, models_to_combine will be none. 
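    # models_to_combine is a set of iteration numbers near the end of training;
    # chain_lib.combine_models later combines those models into final.mdl.  When
    # do_final_combination is false, final.mdl is instead a symlink to the model
    # from the last iteration (see the end of this function).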
+ if args.do_final_combination: + models_to_combine = common_train_lib.get_model_combine_iters( + num_iters, args.num_epochs, + num_archives_expanded, args.max_models_combine, + args.num_jobs_final) + else: + models_to_combine = None + + min_deriv_time = None + max_deriv_time_relative = None + if args.deriv_truncate_margin is not None: + min_deriv_time = -args.deriv_truncate_margin - model_left_context + max_deriv_time_relative = \ + args.deriv_truncate_margin + model_right_context + + silence_pdfs = get_silence_pdfs(args) + + logger.info("Training will run for {0} epochs = " + "{1} iterations".format(args.num_epochs, num_iters)) + + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + + (args.num_jobs_final - args.num_jobs_initial) + * float(iter) / num_iters) + + if args.stage <= iter: + model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter) + + lrate = common_train_lib.get_learning_rate(iter, current_num_jobs, + num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + shrinkage_value = 1.0 - (args.proportional_shrink * lrate) + if shrinkage_value <= 0.5: + raise Exception("proportional-shrink={0} is too large, it gives " + "shrink-value={1}".format(args.proportional_shrink, + shrinkage_value)) + if args.shrink_value < shrinkage_value: + shrinkage_value = (args.shrink_value + if common_train_lib.should_do_shrinkage( + iter, model_file, + args.shrink_saturation_threshold) + else shrinkage_value) + + xent_regularize = args.xent_regularize + l2_regularize = args.l2_regularize + objective_opts = ("--objective-scales=" + args.objective_scales + if args.objective_scales is not None else "") + smbr_factor = 0.0 + if args.smbr_factor_schedule is not None: + smbr_factor = common_train_lib.get_schedule_value( + args.smbr_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --smbr-factor={0}".format(smbr_factor) + + if smbr_factor > 0.0: + use_smbr=True + xent_regularize = (args.smbr_xent_regularize + if args.smbr_xent_regularize is not None + else args.xent_regularize) + l2_regularize = (args.smbr_l2_regularize + if args.smbr_l2_regularize is not None + else args.l2_regularize) + objective_opts += " --use-smbr-objective" + if silence_pdfs is not None: + objective_opts += " --silence-pdfs=" + silence_pdfs + if args.smbr_extra_opts is not None: + objective_opts += " " + args.smbr_extra_opts + + if args.mmi_factor_schedule is not None: + mmi_factor = common_train_lib.get_schedule_value( + args.mmi_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --mmi-factor={0}".format(mmi_factor) + + objective_opts += " --norm-regularize={0}".format( + "true" if args.norm_regularize else "false") + + percent = num_archives_processed * 100.0 / num_archives_to_process + epoch = (num_archives_processed * args.num_epochs + / num_archives_to_process) + shrink_info_str = '' + if shrinkage_value != 1.0: + shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value) + logger.info("Iter: {0}/{1} " + "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete) " + "lr: {5:0.6f} {6}".format(iter, num_iters - 1, + epoch, args.num_epochs, + percent, + lrate, shrink_info_str)) + + chain_lib.train_one_iteration( + dir=args.dir, + iter=iter, + srand=args.srand, + egs_dir=egs_dir, + 
num_jobs=current_num_jobs, + num_archives_processed=num_archives_processed, + num_archives=num_archives, + learning_rate=lrate, + dropout_edit_string=common_train_lib.get_dropout_edit_string( + args.dropout_schedule, + float(num_archives_processed) / num_archives_to_process, + iter), + shrinkage_value=shrinkage_value, + num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, + apply_deriv_weights=args.apply_deriv_weights, + min_deriv_time=min_deriv_time, + max_deriv_time_relative=max_deriv_time_relative, + l2_regularize=l2_regularize, + xent_regularize=xent_regularize, + leaky_hmm_coefficient=(args.smbr_leaky_hmm_coefficient + if smbr_factor > 0.0 + else args.leaky_hmm_coefficient), + momentum=args.momentum, + max_param_change=args.max_param_change, + shuffle_buffer_size=args.shuffle_buffer_size, + frame_subsampling_factor=args.frame_subsampling_factor, + truncate_deriv_weights=args.truncate_deriv_weights, + run_opts=run_opts, + backstitch_training_scale=args.backstitch_training_scale, + backstitch_training_interval=args.backstitch_training_interval, + use_multitask_egs=use_multitask_egs, + objective_opts=objective_opts) + + if args.cleanup: + # do a clean up everythin but the last 2 models, under certain + # conditions + common_train_lib.remove_model( + args.dir, iter-2, num_iters, models_to_combine, + args.preserve_model_interval) + + if args.email is not None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = ( + nnet3_log_parse.generate_acc_logprob_report( + args.dir, "log-probability")) + message = report + subject = ("Update : Expt {dir} : " + "Iter {iter}".format(dir=args.dir, iter=iter)) + common_lib.send_mail(message, subject, args.email) + + num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + xent_regularize = args.xent_regularize + l2_regularize = args.l2_regularize + objective_opts = ("--objective-scales=" + args.objective_scales + if args.objective_scales is not None else "") + smbr_factor = 0.0 + if args.smbr_factor_schedule is not None: + smbr_factor = common_train_lib.get_schedule_value( + args.smbr_factor_schedule, 1.0) + + objective_opts += " --smbr-factor={0}".format(smbr_factor) + + if smbr_factor > 0.0: + use_smbr=True + xent_regularize = (args.smbr_xent_regularize + if args.smbr_xent_regularize is not None + else args.xent_regularize) + l2_regularize = (args.smbr_l2_regularize + if args.smbr_l2_regularize is not None + else args.l2_regularize) + objective_opts = "--use-smbr-objective" + if silence_pdfs is not None: + objective_opts += " --silence-pdfs=" + silence_pdfs + + if args.mmi_factor_schedule is not None: + mmi_factor = common_train_lib.get_schedule_value( + args.mmi_factor_schedule, 1.0) + + objective_opts += " --mmi-factor={0}".format(mmi_factor) + + if args.do_final_combination: + logger.info("Doing final combination to produce final.mdl") + + chain_lib.combine_models( + dir=args.dir, num_iters=num_iters, + models_to_combine=models_to_combine, + num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, + egs_dir=egs_dir, + leaky_hmm_coefficient=args.leaky_hmm_coefficient, + l2_regularize=l2_regularize, + xent_regularize=xent_regularize, + run_opts=run_opts, + max_objective_evaluations=args.max_objective_evaluations, + use_multitask_egs=use_multitask_egs, + objective_opts=objective_opts) + else: + logger.info("Copying the last-numbered model to final.mdl") + 
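            # In this branch final.mdl is simply a (relative) symlink to the
            # model from the last training iteration; no combination pass is run.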
common_lib.force_symlink("{0}.mdl".format(num_iters), + "{0}/final.mdl".format(args.dir)) + chain_lib.compute_train_cv_probabilities( + dir=args.dir, iter=num_iters, egs_dir=egs_dir, + l2_regularize=l2_regularize, xent_regularize=xent_regularize, + leaky_hmm_coefficient=args.leaky_hmm_coefficient, + run_opts=run_opts, + use_multitask_egs=use_multitask_egs, + objective_opts=objective_opts) + common_lib.force_symlink("compute_prob_valid.{iter}.log" + "".format(iter=num_iters-1), + "{dir}/log/compute_prob_valid.final.log".format( + dir=args.dir)) + + if args.cleanup: + logger.info("Cleaning up the experiment directory " + "{0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + common_train_lib.clean_nnet_dir( + args.dir, num_iters, egs_dir, + preserve_model_interval=args.preserve_model_interval, + remove_egs=remove_egs) + + # do some reporting + [report, times, data] = nnet3_log_parse.generate_acc_logprob_report( + args.dir, "log-probability") + if args.email is not None: + common_lib.send_mail(report, "Update : Expt {0} : " + "complete".format(args.dir), args.email) + + with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: + f.write(report) + + common_lib.execute_command("steps/info/nnet3_dir_info.pl " + "{0}".format(args.dir)) + + +def main(): + [args, run_opts] = get_args() + try: + train(args, run_opts) + common_lib.wait_for_background_commands() + except BaseException as e: + # look for BaseException so we catch KeyboardInterrupt, which is + # what we get when a background thread dies. + if args.email is not None: + message = ("Training session for experiment {dir} " + "died due to an error.".format(dir=args.dir)) + common_lib.send_mail(message, message, args.email) + if not isinstance(e, KeyboardInterrupt): + traceback.print_exc() + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index c8cbf67c8b8..7f0e1365449 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -400,7 +400,7 @@ if [ $stage -le 5 ]; then #concatenate egs.JOB.scp in single egs.scp rm $dir/egs.scp 2> /dev/null || true for j in $(seq $num_archives_intermediate); do - for y in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do cat $dir/egs.$j.$y.scp || exit 1; done done > $dir/egs.scp || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py b/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py index e5f5f627567..860c444e342 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py +++ b/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py @@ -98,6 +98,13 @@ def get_args(): parser.add_argument("--samples-per-iter", type=int, default=40000, help="The target number of egs in each archive of egs, " "(prior to merging egs). 
") + parser.add_argument("--frames-per-iter", type=int, default=400000, + help="The target number of frames in each archive of " + "egs") + parser.add_argument("--frames-per-eg-list", type=str, default=None, + action=common_lib.NullstrToNoneAction, + help="Number of frames per eg for each input language " + "as a comma separated list") parser.add_argument("--num-jobs", type=int, default=20, help="This can be used for better randomization in distributing " "examples for different languages across egs.*.scp files, " @@ -107,7 +114,7 @@ def get_args(): help="If true, egs.ranges.*.txt are generated " "randomly w.r.t distribution of remaining examples in " "each language, otherwise it is generated sequentially.", - default=True, choices = ["false", "true"]) + default=True, choices=["false", "true"]) parser.add_argument("--max-archives", type=int, default=1000, help="max number of archives used to generate egs.*.scp") parser.add_argument("--seed", type=int, default=1, @@ -129,7 +136,7 @@ def get_args(): # now the positional arguments parser.add_argument("egs_scp_lists", nargs='+', help="list of egs.scp files per input language." - "e.g. exp/lang1/egs/egs.scp exp/lang2/egs/egs.scp") + "e.g. exp/lang1/egs/egs.scp exp/lang2/egs/egs.scp") parser.add_argument("egs_dir", help="Name of egs directory e.g. exp/tdnn_multilingual_sp/egs") @@ -153,7 +160,7 @@ def select_random_lang(lang_len, tot_egs, random_selection): count = 0 for l in range(len(lang_len)): if random_selection: - if rand_int <= (count + lang_len[l]): + if rand_int <= (count + lang_len[l]): return l else: count += lang_len[l] @@ -172,6 +179,10 @@ def process_multilingual_egs(args): scp_lists = args.egs_scp_lists num_langs = len(scp_lists) + frames_per_eg = ([1 for x in scp_lists] + if args.frames_per_eg_list is None + else [int(x) for x in args.frames_per_eg_list.split(',')]) + scp_files = [open(scp_lists[lang], 'r') for lang in range(num_langs)] lang2len = [0] * num_langs @@ -182,7 +193,7 @@ def process_multilingual_egs(args): # If weights are not provided, the weights are 1.0. if args.lang2weight is None: - lang2weight = [ 1.0 ] * num_langs + lang2weight = [1.0] * num_langs else: lang2weight = args.lang2weight.split(",") assert(len(lang2weight) == num_langs) @@ -195,10 +206,16 @@ def process_multilingual_egs(args): # Each element of all_egs (one per num_archive * num_jobs) is # an array of 3-tuples (lang-id, local-start-egs-line, num-egs) all_egs = [] - lang_len = lang2len[:] - # total num of egs in all languages - tot_num_egs = sum(lang2len[i] for i in range(len(lang2len))) - num_archives = max(1, min(args.max_archives, tot_num_egs // args.samples_per_iter)) + num_frames_in_lang = [frames_per_eg[i] * lang2len[i] + for i in range(num_langs)] + for lang in range(num_langs): + logger.info("Number of frames for language {0} " + "is {1}.".format(lang, num_frames_in_lang[lang])) + + # total num of frames in all languages + tot_num_frames = sum(num_frames_in_lang[i] for i in range(num_langs)) + num_archives = max(1, min(args.max_archives, + tot_num_frames / args.frames_per_iter)) num_arch_file = open("{0}/info/{1}num_archives".format( args.egs_dir, @@ -206,7 +223,7 @@ def process_multilingual_egs(args): "w") print("{0}".format(num_archives), file=num_arch_file) num_arch_file.close() - this_num_egs_per_archive = tot_num_egs // (num_archives * args.num_jobs) + this_num_frames_per_archive = tot_num_frames / (num_archives * args.num_jobs) logger.info("Generating {0}scp.. 
temporary files used to " "generate {0}.scp.".format(args.egs_prefix)) @@ -216,29 +233,36 @@ def process_multilingual_egs(args): "".format(args.egs_dir, args.egs_prefix, job + 1, archive_index + 1), "w") - this_egs = [] # this will be array of 2-tuples (lang-id start-frame num-frames) + # this will be array of 2-tuples (lang-id start-frame num-frames) + this_egs = [] num_egs = 0 - while num_egs <= this_num_egs_per_archive: - num_left_egs = sum(num_left_egs_per_lang for - num_left_egs_per_lang in lang_len) - if num_left_egs > 0: - lang_id = select_random_lang(lang_len, num_left_egs, rand_select) - start_egs = lang2len[lang_id] - lang_len[lang_id] + num_frames = 0 + while num_frames <= this_num_frames_per_archive: + num_frames_left = sum(num_frames_in_lang) + if num_frames_left > 0: + lang_id = select_random_lang(num_frames_in_lang, + num_frames_left, rand_select) + start_egs = ( + lang2len[lang_id] + - num_frames_in_lang[lang_id] / frames_per_eg[lang_id]) this_egs.append((lang_id, start_egs, args.minibatch_size)) for scpline in range(args.minibatch_size): scp_key = scp_files[lang_id].readline().splitlines()[0] print("{0} {1}".format(scp_key, lang_id), file=archfile) - lang_len[lang_id] = lang_len[lang_id] - args.minibatch_size - num_egs = num_egs + args.minibatch_size + num_frames_in_lang[lang_id] -= ( + args.minibatch_size * frames_per_eg[lang_id]) + num_egs += args.minibatch_size + num_frames += args.minibatch_size * frames_per_eg[lang_id] # If num of remaining egs in each lang is less than minibatch_size, # they are discarded. - if lang_len[lang_id] < args.minibatch_size: - lang_len[lang_id] = 0 - logger.info("Done processing data for language {0}".format( - lang_id)) + if (num_frames_in_lang[lang_id] + < args.minibatch_size * frames_per_eg[lang_id]): + num_frames_in_lang[lang_id] = 0 + logger.info("Done processing data for language {0}" + "".format(lang_id)) else: logger.info("Done processing data for all languages.") break @@ -315,4 +339,4 @@ def main(): if __name__ == "__main__": - main() + main() diff --git a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh index b1fda1d42d5..a7c1f7d0b0f 100755 --- a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh +++ b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh @@ -99,14 +99,14 @@ data_id=`basename $data_dir` sad_dir=${dir}/${sad_name}${affix}_${data_id}_whole${feat_affix} seg_dir=${dir}/${segmentation_name}${affix}_${data_id}_whole${feat_affix} -test_data_dir=data/${data_id}${feat_affix}_hires - if $convert_data_dir_to_whole; then + test_data_dir=data/${data_id}_whole${feat_affix}_hires if [ $stage -le 0 ]; then rm -r ${test_data_dir} || true utils/data/convert_data_dir_to_whole.sh $src_data_dir ${test_data_dir} fi else + test_data_dir=data/${data_id}${feat_affix}_hires if [ $stage -le 0 ]; then rm -r ${test_data_dir} || true utils/copy_data_dir.sh $src_data_dir $test_data_dir From 94dc65af1d0589a6544ed76820463196558231fc Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 20 Mar 2018 17:44:05 -0400 Subject: [PATCH 138/174] Multiple smbr factors for outputs --- .../nnet3/train/chain_objf/acoustic_model.py | 27 +++--- .../libs/nnet3/train/dropout_schedule.py | 4 +- egs/wsj/s5/steps/nnet3/chain/train.py | 15 ++-- src/chain/chain-denominator-smbr.cc | 28 +++--- src/chain/chain-denominator-smbr.h | 2 + src/chain/chain-training.h | 20 ++++- src/nnet3/nnet-chain-diagnostics.cc | 87 +++++++++---------- src/nnet3/nnet-chain-diagnostics.h | 4 +- 
src/nnet3/nnet-chain-training.cc | 20 +++-- src/nnet3/nnet-chain-training.h | 4 +- 10 files changed, 120 insertions(+), 91 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 7c273456497..468159b11b2 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -127,7 +127,7 @@ def train_new_models(dir, iter, srand, num_jobs, raw_model_string, egs_dir, apply_deriv_weights, min_deriv_time, max_deriv_time_relative, - l2_regularize, xent_regularize, leaky_hmm_coefficient, + l2_regularize, xent_regularize, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch_str, frame_subsampling_factor, truncate_deriv_weights, run_opts, train_opts, @@ -192,7 +192,7 @@ def train_new_models(dir, iter, srand, num_jobs, """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ nnet3-chain-train {parallel_train_opts} {verbose_opt} \ --apply-deriv-weights={app_deriv_wts} {objective_opts} \ - --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --l2-regularize={l2} \ {cache_io_opts} --xent-regularize={xent_reg} \ {deriv_time_opts} \ --print-interval=10 --momentum={momentum} \ @@ -219,7 +219,7 @@ def train_new_models(dir, iter, srand, num_jobs, app_deriv_wts=apply_deriv_weights, fr_shft=frame_shift, l2=l2_regularize, train_opts=train_opts, - xent_reg=xent_regularize, leaky=leaky_hmm_coefficient, + xent_reg=xent_regularize, cache_io_opts=cache_io_opts, parallel_train_opts=run_opts.parallel_train_opts, verbose_opt=verbose_opt, @@ -249,7 +249,6 @@ def train_one_iteration(dir, iter, srand, egs_dir, apply_deriv_weights, min_deriv_time, max_deriv_time_relative, l2_regularize, xent_regularize, - leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, frame_subsampling_factor, truncate_deriv_weights, run_opts, dropout_edit_string="", train_opts="", @@ -285,7 +284,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, compute_train_cv_probabilities( dir=dir, iter=iter, egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, - leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, + run_opts=run_opts, use_multitask_egs=use_multitask_egs, objective_opts=objective_opts) @@ -335,7 +334,6 @@ def train_one_iteration(dir, iter, srand, egs_dir, max_deriv_time_relative=max_deriv_time_relative, l2_regularize=l2_regularize, xent_regularize=xent_regularize, - leaky_hmm_coefficient=leaky_hmm_coefficient, momentum=momentum, max_param_change=cur_max_param_change, shuffle_buffer_size=shuffle_buffer_size, @@ -494,7 +492,7 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_model=None): def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, - xent_regularize, leaky_hmm_coefficient, + xent_regularize, run_opts, use_multitask_egs=False, objective_opts=""): @@ -514,12 +512,12 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, common_lib.background_command( """{command} {dir}/log/compute_prob_valid.{iter}.log \ nnet3-chain-compute-prob --l2-regularize={l2} {objective_opts} \ - --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ + --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/valid_diagnostic{egs_suffix} \ ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ 
""".format(command=run_opts.command, dir=dir, iter=iter, model=model, - l2=l2_regularize, leaky=leaky_hmm_coefficient, + l2=l2_regularize, xent_reg=xent_regularize, egs_dir=egs_dir, multitask_egs_opts=multitask_egs_opts, @@ -534,12 +532,12 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, common_lib.background_command( """{command} {dir}/log/compute_prob_train.{iter}.log \ nnet3-chain-compute-prob --l2-regularize={l2} {objective_opts} \ - --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ + --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/train_diagnostic{egs_suffix} \ ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ """.format(command=run_opts.command, dir=dir, iter=iter, model=model, - l2=l2_regularize, leaky=leaky_hmm_coefficient, + l2=l2_regularize, xent_reg=xent_regularize, egs_dir=egs_dir, multitask_egs_opts=multitask_egs_opts, @@ -586,7 +584,7 @@ def compute_progress(dir, iter, run_opts): def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, - egs_dir, leaky_hmm_coefficient, l2_regularize, + egs_dir, l2_regularize, xent_regularize, run_opts, max_objective_evaluations=30, use_multitask_egs=False, @@ -633,7 +631,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st """{command} {combine_queue_opt} {dir}/log/combine.log \ nnet3-chain-combine {objective_opts} \ --max-objective-evaluations={max_objective_evaluations} \ - --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --l2-regularize={l2} \ --verbose=3 {combine_gpu_opt} {dir}/den.fst {raw_models} \ "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/combine{egs_suffix} ark:- | \ nnet3-chain-merge-egs --minibatch-size={num_chunk_per_mb} \ @@ -644,7 +642,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st combine_queue_opt=run_opts.combine_queue_opt, combine_gpu_opt=run_opts.combine_gpu_opt, max_objective_evaluations=max_objective_evaluations, - l2=l2_regularize, leaky=leaky_hmm_coefficient, + l2=l2_regularize, dir=dir, raw_models=" ".join(raw_model_strings), num_chunk_per_mb=num_chunk_per_minibatch_str, num_iters=num_iters, @@ -659,7 +657,6 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st compute_train_cv_probabilities( dir=dir, iter='final', egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, - leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, use_multitask_egs=use_multitask_egs, objective_opts=objective_opts) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py index 2471abf040f..b89dc171a74 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py @@ -222,9 +222,7 @@ def get_schedule_string(schedule, data_fraction): proportion_string.append( "{}:{}".format(component_name, proportion)) - ' '.join(proportion_string) - - return proportions[0][1] + return ' '.join(proportion_string) def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 3d57fea8c7e..f8ab4220f45 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -634,6 +634,11 @@ def train(args, run_opts): percent, lrate, shrink_info_str)) 
+ objective_opts += " --leaky-hmm-coefficient={0} {1}".format( + args.leaky_hmm_coefficient, + "" if args.smbr_leaky_hmm_coefficient is None else + "--smbr-leaky-hmm-coefficient={}".format(args.smbr_leaky_hmm_coefficient)) + chain_lib.train_one_iteration( dir=args.dir, iter=iter, @@ -655,9 +660,6 @@ def train(args, run_opts): max_deriv_time_relative=max_deriv_time_relative, l2_regularize=l2_regularize, xent_regularize=xent_regularize, - leaky_hmm_coefficient=(args.smbr_leaky_hmm_coefficient - if use_smbr_objective and args.smbr_leaky_hmm_coefficient is not None - else args.leaky_hmm_coefficient), momentum=args.momentum, max_param_change=args.max_param_change, shuffle_buffer_size=args.shuffle_buffer_size, @@ -739,6 +741,11 @@ def train(args, run_opts): objective_opts += " --norm-regularize={0}".format( "true" if args.norm_regularize else "false") + objective_opts += " --leaky-hmm-coefficient={0} {1}".format( + args.leaky_hmm_coefficient, + "" if args.smbr_leaky_hmm_coefficient is None else + "--smbr-leaky-hmm-coefficient={}".format(args.smbr_leaky_hmm_coefficient)) + if args.do_final_combination: logger.info("Doing final combination to produce final.mdl") @@ -747,7 +754,6 @@ def train(args, run_opts): models_to_combine=models_to_combine, num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, egs_dir=egs_dir, - leaky_hmm_coefficient=args.leaky_hmm_coefficient, l2_regularize=l2_regularize, xent_regularize=xent_regularize, run_opts=run_opts, @@ -761,7 +767,6 @@ def train(args, run_opts): chain_lib.compute_train_cv_probabilities( dir=args.dir, iter=num_iters, egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, - leaky_hmm_coefficient=args.leaky_hmm_coefficient, run_opts=run_opts, use_multitask_egs=use_multitask_egs, objective_opts=objective_opts) diff --git a/src/chain/chain-denominator-smbr.cc b/src/chain/chain-denominator-smbr.cc index b96cc9388f7..c18154b1d9b 100644 --- a/src/chain/chain-denominator-smbr.cc +++ b/src/chain/chain-denominator-smbr.cc @@ -63,6 +63,14 @@ DenominatorSmbrComputation::DenominatorSmbrComputation( ok_(true) { KALDI_ASSERT(opts_.leaky_hmm_coefficient >= 0.0 && opts_.leaky_hmm_coefficient < 1.0); + + KALDI_ASSERT(opts_.smbr_leaky_hmm_coefficient < 1.0); + + if (opts_.smbr_leaky_hmm_coefficient < 0.0) + leaky_hmm_coefficient_ = opts_.leaky_hmm_coefficient; + else + leaky_hmm_coefficient_ = opts_.smbr_leaky_hmm_coefficient; + // make sure the alpha sums and beta sums are zeroed. alpha_.ColRange(den_graph_.NumStates() * num_sequences_, num_sequences_).SetZero(); @@ -230,7 +238,7 @@ void DenominatorSmbrComputation::AlphaSmbrDash(int32 t) { den_graph_.NumStates() * num_sequences_, num_sequences_); alpha_sum_vec.AddRowSumMat(1.0, alpha_mat, 0.0); - + CuSubVector alpha_smbr_sum_vec( this_alpha_smbr + den_graph_.NumStates() * num_sequences_, num_sequences_); @@ -238,14 +246,14 @@ void DenominatorSmbrComputation::AlphaSmbrDash(int32 t) { KALDI_ASSERT(alpha_sum_vec.Min() > 0); - alpha_smbr_mat.AddVecVec(opts_.leaky_hmm_coefficient, + alpha_smbr_mat.AddVecVec(leaky_hmm_coefficient_, den_graph_.InitialProbs(), alpha_smbr_sum_vec); - alpha_mat.AddVecVec(opts_.leaky_hmm_coefficient, + alpha_mat.AddVecVec(leaky_hmm_coefficient_, den_graph_.InitialProbs(), alpha_sum_vec); // it's now alpha-dash. 
- + alpha_smbr_mat.DivElements(alpha_mat); } @@ -268,19 +276,19 @@ void DenominatorSmbrComputation::BetaSmbr(int32 t) { beta_smbr_dash_mat.MulElements(beta_dash_mat); // making the t index implicit, the beta-dash-sum for each sequence is the sum - // over all states i of beta_i * opts_.leaky_hmm_coefficient * initial_prob_i. + // over all states i of beta_i * leaky_hmm_coefficient_ * initial_prob_i. CuSubVector beta_dash_sum_vec( this_beta_dash + den_graph_.NumStates() * num_sequences_, num_sequences_); - beta_dash_sum_vec.AddMatVec(opts_.leaky_hmm_coefficient, beta_dash_mat, + beta_dash_sum_vec.AddMatVec(leaky_hmm_coefficient_, beta_dash_mat, kTrans, den_graph_.InitialProbs(), 0.0); CuSubVector beta_smbr_dash_sum_vec( this_beta_smbr_dash + den_graph_.NumStates() * num_sequences_, num_sequences_); - beta_smbr_dash_sum_vec.AddMatVec(opts_.leaky_hmm_coefficient, + beta_smbr_dash_sum_vec.AddMatVec(leaky_hmm_coefficient_, beta_smbr_dash_mat, kTrans, den_graph_.InitialProbs(), 0.0); - + // we are computing beta in place. After the following, beta-dash-mat // will contain the actual beta (i.e. the counterpart of alpha), // not the beta-dash. @@ -344,13 +352,13 @@ BaseFloat DenominatorSmbrComputation::ComputeTotObjf(BaseFloat *aux_objf) { BaseFloat prob_sum = tot_prob_.Sum(); KALDI_ASSERT(prob_sum == prob_sum); - + // Take weighted-average of the SMBR quantitites over all the // HMM states for each sequence. last_alpha_smbr.MulElements(last_alpha_dash); tot_smbr_.AddRowSumMat(1.0, last_alpha_smbr, 0.0); tot_smbr_.DivElements(tot_prob_); - + if (aux_objf) *aux_objf = -opts_.mmi_factor * ( tot_log_prob + log_inv_arbitrary_scales_product); diff --git a/src/chain/chain-denominator-smbr.h b/src/chain/chain-denominator-smbr.h index ca526d68f2e..fce3114521d 100644 --- a/src/chain/chain-denominator-smbr.h +++ b/src/chain/chain-denominator-smbr.h @@ -345,6 +345,8 @@ class DenominatorSmbrComputation { CuVector log_correction_term_; bool ok_; + + BaseFloat leaky_hmm_coefficient_ = 1e-05; }; diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 200786b9164..abba85e7de9 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -55,7 +55,6 @@ struct ChainTrainingOptions { // epsilon loops. BaseFloat leaky_hmm_coefficient; - // Cross-entropy regularization constant. (e.g. try 0.1). If nonzero, // the network is expected to have an output named 'output-xent', which // should have a softmax as its final nonlinearity. 
@@ -74,11 +73,17 @@ struct ChainTrainingOptions { bool norm_regularize; + BaseFloat smbr_leaky_hmm_coefficient; + + std::string smbr_factors_str, mmi_factors_str, ml_factors_str; + ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), xent_regularize(0.0), use_smbr_objective(false), exclude_silence(false), one_silence_class(false), - mmi_factor(1.0), ml_factor(0.0), smbr_factor(0.0), smbr_threshold(0.0), - norm_regularize(false) { } + mmi_factor(1.0), ml_factor(0.0), + smbr_factor(0.0), smbr_threshold(0.0), + norm_regularize(false), + smbr_leaky_hmm_coefficient(-1) { } void Register(OptionsItf *opts) { opts->Register("l2-regularize", &l2_regularize, "l2 regularization " @@ -124,6 +129,15 @@ struct ChainTrainingOptions { "if this options is true."); opts->Register("smbr-threshold", &smbr_threshold, "Posterior below this value is considered 0"); + opts->Register("smbr-factors", &smbr_factors_str, + "SMBR factors for each output"); + opts->Register("mmi-factors", &mmi_factors_str, + "MMI factors for each output"); + opts->Register("ml-factors", &ml_factors_str, + "ML factors for each output"); + opts->Register("smbr-leaky-hmm-coefficient", &smbr_leaky_hmm_coefficient, + "leaky-hmm-coefficient for LF-sMBR training. If not " + "provided, will use --leaky-hmm-coefficient instead."); } }; diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 0e0fe03cd0c..2bbe1db5f97 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -87,22 +87,15 @@ NnetChainComputeProb::NnetChainComputeProb( sil_indices_.CopyFromVec(indices); } - if (!nnet_config.objective_scales_str.empty()) { - std::vector objectives_for_outputs; - SplitStringToVector(nnet_config.objective_scales_str, ",", false, - &objectives_for_outputs); - std::vector::const_iterator it = objectives_for_outputs.begin(); - for (; it != objectives_for_outputs.end(); ++it) { - std::vector this_output_objective; - SplitStringToVector(*it, ":", false, - &this_output_objective); - - BaseFloat scale; - ConvertStringToReal(this_output_objective[1], &scale); - objective_scales_.insert( - std::make_pair(this_output_objective[0], scale)); - } - } + if (!chain_config.smbr_factors_str.empty()) + ParseObjectiveScales(chain_config.smbr_factors_str, + &smbr_factors_); + if (!chain_config.mmi_factors_str.empty()) + ParseObjectiveScales(chain_config.mmi_factors_str, + &mmi_factors_); + if (!chain_config.ml_factors_str.empty()) + ParseObjectiveScales(chain_config.ml_factors_str, + &ml_factors_); } @@ -209,7 +202,26 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, KALDI_ERR << "Network has no output named " << sup.name; const CuMatrixBase &nnet_output = computer->GetOutput(sup.name); - bool use_xent = (chain_config_.xent_regularize != 0.0); + + chain::ChainTrainingOptions chain_config_copy(chain_config_); + + { + auto it = smbr_factors_.find(sup.name); + if (it != smbr_factors_.end()) + chain_config_copy.smbr_factor = it->second; + } + { + auto it = mmi_factors_.find(sup.name); + if (it != mmi_factors_.end()) + chain_config_copy.mmi_factor = it->second; + } + { + auto it = ml_factors_.find(sup.name); + if (it != ml_factors_.end()) + chain_config_copy.ml_factor = it->second; + } + + bool use_xent = (chain_config_copy.xent_regularize != 0.0); std::string xent_name = sup.name + "-xent"; // typically "output-xent". 
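ParseObjectiveScales itself is not part of this patch, so the exact string syntax is an assumption; judging from the help text above and from the Python side (get_schedule_string joins name:proportion pairs with spaces), each of --smbr-factors, --mmi-factors and --ml-factors carries one name:value pair per output. A hypothetical Python equivalent of the map these strings are parsed into:

# Hypothetical sketch, not the real C++ helper: the pair separator is assumed to be
# whitespace, matching the strings the training scripts construct,
# e.g. --mmi-factors='output:1.0 output-1:0.5'.
def parse_objective_scales(factors_str):
    factors = {}
    for pair in factors_str.split():
        name, value = pair.split(":")
        factors[name] = float(value)
    return factors

mmi_factors = parse_objective_scales("output:1.0 output-1:0.5")
# NnetChainComputeProb::ProcessOutputs() then overrides mmi_factor (and likewise
# smbr_factor / ml_factor) in its per-output copy of the config whenever the
# supervision name has an entry in the corresponding map.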
CuMatrix nnet_output_deriv, xent_deriv; if (nnet_config_.compute_deriv) @@ -219,38 +231,24 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); + BaseFloat tot_like, tot_mmi_objf, tot_l2_term, tot_weight; - if (chain_config_.use_smbr_objective) + if (chain_config_copy.use_smbr_objective) ComputeChainSmbrObjfAndDeriv( - chain_config_, den_graph_, + chain_config_copy, den_graph_, sup.supervision, nnet_output, &tot_like, &tot_mmi_objf, &tot_l2_term, &tot_weight, (nnet_config_.compute_deriv ? &nnet_output_deriv : NULL), (use_xent ? &xent_deriv : NULL), sil_indices_.Dim() ? &sil_indices_ : NULL); else - ComputeChainObjfAndDeriv(chain_config_, den_graph_, + ComputeChainObjfAndDeriv(chain_config_copy, den_graph_, sup.supervision, nnet_output, &tot_like, &tot_l2_term, &tot_weight, (nnet_config_.compute_deriv ? &nnet_output_deriv : NULL), (use_xent ? &xent_deriv : NULL)); - BaseFloat objf_scale = 1.0; - { - unordered_map::iterator it = - objective_scales_.find(sup.name); - - if (it != objective_scales_.end()) { - objf_scale = it->second; - tot_like *= it->second; - tot_l2_term *= it->second; - tot_mmi_objf *= it->second; - tot_weight *= it->second; - if (nnet_config_.compute_deriv) - nnet_output_deriv.Scale(it->second); - } - } // note: in this context we don't want to apply 'sup.deriv_weights' because // this code is used only in combination, where it's part of an L-BFGS @@ -262,7 +260,7 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, std::vector aux_objfs; aux_objfs.push_back(tot_l2_term); - if (chain_config_.use_smbr_objective) + if (chain_config_copy.use_smbr_objective) aux_objfs.push_back(tot_mmi_objf); { @@ -270,12 +268,12 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, = objf_info_.find(sup.name); if (it == objf_info_.end()) { - BaseFloat this_objf_scale = objf_scale; - std::vector aux_objf_scales(1, objf_scale); // for l2 term - if (chain_config_.use_smbr_objective) { - this_objf_scale *= chain_config_.smbr_factor; + BaseFloat this_objf_scale = 1.0; + std::vector aux_objf_scales(1, 1.0); // l2_term + if (chain_config_copy.use_smbr_objective) { + this_objf_scale *= chain_config_copy.smbr_factor; aux_objf_scales.push_back( - objf_scale * (chain_config_.mmi_factor + chain_config_.ml_factor)); + (chain_config_copy.mmi_factor + chain_config_copy.ml_factor)); } ChainObjectiveInfo totals(this_objf_scale, aux_objf_scales); @@ -299,14 +297,7 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, // computation. note, xent_deriv has a factor of '.supervision.weight', // but so does tot_weight. 
BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans); - unordered_map::iterator it = - objective_scales_.find(xent_name); - if (it != objective_scales_.end()) { - xent_objf *= it->second; - xent_deriv.Scale(it->second); - } - xent_totals.tot_weight += tot_weight; xent_totals.tot_like += xent_objf; } diff --git a/src/nnet3/nnet-chain-diagnostics.h b/src/nnet3/nnet-chain-diagnostics.h index ec45a9ad43d..ed265233636 100644 --- a/src/nnet3/nnet-chain-diagnostics.h +++ b/src/nnet3/nnet-chain-diagnostics.h @@ -116,7 +116,9 @@ class NnetChainComputeProb { CuArray sil_indices_; - unordered_map objective_scales_; + unordered_map smbr_factors_; + unordered_map mmi_factors_; + unordered_map ml_factors_; }; /// This function zeros the stored component-level stats in the nnet using diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 5173e1f1670..546933a452d 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -151,18 +151,21 @@ class ChainTrainerMemoryHolder { public: ChainTrainerMemoryHolder(const Nnet &nnet, int32 num_den_graph_states, - const NnetChainExample &eg); + const NnetChainExample &eg, + bool use_smbr_objective = false); private: CuMatrix nnet_output_deriv_; CuMatrix xent_output_deriv_; CuMatrix beta_; CuMatrix alpha_; - + CuMatrix beta_smbr_; + CuMatrix alpha_smbr_; }; ChainTrainerMemoryHolder::ChainTrainerMemoryHolder(const Nnet &nnet, int32 den_graph_states, - const NnetChainExample &eg) { + const NnetChainExample &eg, + bool use_smbr_objective) { std::vector::const_iterator iter = eg.outputs.begin(), end = eg.outputs.end(); @@ -206,7 +209,6 @@ ChainTrainerMemoryHolder::ChainTrainerMemoryHolder(const Nnet &nnet, max_sequence_size, kUndefined); - nnet_output_deriv_.Resize(max_rows, max_cols, kUndefined); // note: the same block of memory can be used for xent_output_deriv_ as is // used for exp_nnet_output_transposed_ in chain-training.cc. @@ -214,6 +216,13 @@ ChainTrainerMemoryHolder::ChainTrainerMemoryHolder(const Nnet &nnet, kUndefined, kStrideEqualNumCols); beta_.Resize(2, max_sequence_size, kUndefined); + + if (use_smbr_objective) { + alpha_smbr_.Resize(max_frames_per_sequence, + max_sequence_size, + kUndefined); + beta_smbr_.Resize(2, max_sequence_size, kUndefined); + } } void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, @@ -228,7 +237,8 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, // reserve the memory needed in ProcessOutputs (before memory gets fragmented // by the call to computer.Run(). ChainTrainerMemoryHolder *memory_holder = - new ChainTrainerMemoryHolder(*nnet_, den_graph_.NumStates(), eg); + new ChainTrainerMemoryHolder(*nnet_, den_graph_.NumStates(), eg, + opts_.chain_config.use_smbr_objective); // give the inputs to the computer object. 
computer.AcceptInputs(*nnet_, eg.inputs); diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h index 9fe73f9c726..de079bb327d 100644 --- a/src/nnet3/nnet-chain-training.h +++ b/src/nnet3/nnet-chain-training.h @@ -117,7 +117,9 @@ class NnetChainTrainer { CuArray sil_indices_; - unordered_map objective_scales_; + unordered_map smbr_factors_; + unordered_map mmi_factors_; + unordered_map ml_factors_; }; From 4cb144cfd26d7dc8c590085f1054d857008813f4 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 21 Mar 2018 13:09:11 -0400 Subject: [PATCH 139/174] adding kl factors --- .../multi_condition/run_ivector_common.sh | 2 - egs/wsj/s5/steps/nnet3/chain/train.py | 19 ++ egs/wsj/s5/steps/nnet3/chain/train_ts.py | 174 ++++++++++++------ src/chain/chain-supervision.cc | 18 +- src/chain/chain-training.cc | 19 +- src/chain/chain-training.h | 7 +- src/nnet3/nnet-chain-diagnostics.cc | 48 ++--- src/nnet3/nnet-chain-diagnostics.h | 1 + src/nnet3/nnet-chain-example.cc | 9 + src/nnet3/nnet-chain-training.cc | 47 ++--- src/nnet3/nnet-chain-training.h | 1 + 11 files changed, 239 insertions(+), 106 deletions(-) diff --git a/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh b/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh index 69f655d9039..2b16ad70e89 100755 --- a/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh +++ b/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh @@ -117,8 +117,6 @@ if [ $stage -le 3 ]; then utils/combine_data.sh data/${mic}/${train_set}_sp_rvb_hires data/${mic}/${train_set}_sp_hires ${norvb_datadir}_rvb${num_data_reps}_hires fi -exit 1 - if [ $stage -le 4 ]; then steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index eecf6448479..b525493945b 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -132,6 +132,10 @@ def get_args(): dest='ml_factor_schedule', default=None, action=common_lib.NullstrToNoneAction, help="Schedule for ML factor in LF-SMBR training.") + parser.add_argument("--chain.kl-factor-schedule", type=str, + dest='kl_factor_schedule', default=None, + action=common_lib.NullstrToNoneAction, + help="Schedule for KL factor in LF-SMBR training.") parser.add_argument("--chain.smbr-xent-regularize", default=None, dest='smbr_xent_regularize', type=float, help="Xent regularizer term used with sMBR training") @@ -618,6 +622,13 @@ def train(args, run_opts): objective_opts += " --ml-factors='{0}'".format(ml_factors) + if args.kl_factor_schedule is not None: + kl_factors = common_train_lib.get_schedule_string( + args.kl_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --kl-factors='{0}'".format(kl_factors) + objective_opts += " --norm-regularize={0}".format( "true" if args.norm_regularize else "false") @@ -738,6 +749,14 @@ def train(args, run_opts): objective_opts += " --ml-factors='{0}'".format(ml_factors) + if args.kl_factor_schedule is not None: + kl_factors = common_train_lib.get_schedule_string( + args.kl_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --kl-factors='{0}'".format(kl_factors) + + objective_opts += " --norm-regularize={0}".format( "true" if args.norm_regularize else "false") diff --git a/egs/wsj/s5/steps/nnet3/chain/train_ts.py b/egs/wsj/s5/steps/nnet3/chain/train_ts.py index 
ec86b873195..4ca3f314154 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_ts.py +++ b/egs/wsj/s5/steps/nnet3/chain/train_ts.py @@ -53,7 +53,7 @@ def get_args(): # egs extraction options parser.add_argument("--egs.chunk-width", type=str, dest='chunk_width', - default="20", + default=None, action=common_lib.NullstrToNoneAction, help="""Number of frames per chunk in the examples used to train the RNN. Caution: if you double this you should halve --trainer.samples-per-iter. May be @@ -86,9 +86,6 @@ def get_args(): parser.add_argument("--chain.leaky-hmm-coefficient", type=float, dest='leaky_hmm_coefficient', default=0.00001, help="") - parser.add_argument("--chain.smbr-leaky-hmm-coefficient", type=float, - dest='smbr_leaky_hmm_coefficient', default=0.00001, - help="") parser.add_argument("--chain.apply-deriv-weights", type=str, dest='apply_deriv_weights', default=True, action=common_lib.StrToBoolAction, @@ -119,12 +116,23 @@ def get_args(): dest='mmi_factor_schedule', default=None, action=common_lib.NullstrToNoneAction, help="Schedule for MMI factor in LF-SMBR training.") + parser.add_argument("--chain.ml-factor-schedule", type=str, + dest='ml_factor_schedule', default=None, + action=common_lib.NullstrToNoneAction, + help="Schedule for ML factor in LF-SMBR training.") + parser.add_argument("--chain.kl-factor-schedule", type=str, + dest='kl_factor_schedule', default=None, + action=common_lib.NullstrToNoneAction, + help="Schedule for KL factor in LF-SMBR training.") parser.add_argument("--chain.smbr-xent-regularize", default=None, dest='smbr_xent_regularize', type=float, help="Xent regularizer term used with sMBR training") parser.add_argument("--chain.smbr-l2-regularize", default=None, dest='smbr_l2_regularize', type=float, help="L2 regularizer term used with sMBR training") + parser.add_argument("--chain.smbr-leaky-hmm-coefficient", type=float, + dest='smbr_leaky_hmm_coefficient', default=None, + help="") # trainer options parser.add_argument("--trainer.input-model", type=str, @@ -224,7 +232,8 @@ def process_args(args): """ Process the options got from get_args() """ - if not common_train_lib.validate_chunk_width(args.chunk_width): + if (args.chunk_width is not None and + not common_train_lib.validate_chunk_width(args.chunk_width)): raise Exception("--egs.chunk-width has an invalid value") if not common_train_lib.validate_minibatch_size_str(args.num_chunk_per_minibatch): @@ -245,14 +254,12 @@ def process_args(args): "--trainer.deriv-truncate-margin.".format( args.deriv_truncate_margin)) - if (not os.path.exists(args.dir) - or (not os.path.exists(args.dir+"/configs") and - (args.input_model is None or not os.path.exists(args.input_model)))): - raise Exception("This script expects {0} to exist. Also either " - "--trainer.input-model option as initial 'raw' model " - "(used as 0.raw in the script) should be supplied or " - "{0}/configs directory which is the output of " - "make_configs.py script should be provided." + if (not os.path.exists(args.dir)): + raise Exception("This script expects --dir={0} to exist.") + if (not os.path.exists(args.dir+"/configs") and + (args.input_model is None or not os.path.exists(args.input_model))): + raise Exception("Either --trainer.input-model option should be supplied, " + "and exist; or the {0}/configs directory should exist." 
"".format(args.dir)) if args.transform_dir is None: @@ -270,6 +277,7 @@ def process_args(args): run_opts.train_queue_opt = "--gpu 1" run_opts.parallel_train_opts = "" run_opts.combine_queue_opt = "--gpu 1" + run_opts.combine_gpu_opt = "" else: logger.warning("Without using a GPU this will be very slow. " @@ -278,6 +286,7 @@ def process_args(args): run_opts.train_queue_opt = "" run_opts.parallel_train_opts = "--use-gpu=no" run_opts.combine_queue_opt = "" + run_opts.combine_gpu_opt = "--use-gpu=no" run_opts.command = args.command run_opts.egs_command = (args.egs_command @@ -334,6 +343,10 @@ def train(args, run_opts): args.lat_dir if args.egs_dir is None else None) + # Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will + # use it to check compatibility between training and decoding phone-sets. + shutil.copy('{0}/phones.txt'.format(args.tree_dir), args.dir) + # Set some variables. num_jobs = common_lib.get_number_of_jobs(args.tree_dir) feat_dim = common_lib.get_feat_dim(args.feat_dir) @@ -397,21 +410,21 @@ def train(args, run_opts): {dir}/init.raw""".format(command=run_opts.command, dir=args.dir)) - egs_left_context = left_context + args.frame_subsampling_factor / 2 - egs_right_context = right_context + args.frame_subsampling_factor / 2 + egs_left_context = left_context + args.frame_subsampling_factor // 2 + egs_right_context = right_context + args.frame_subsampling_factor // 2 # note: the '+ args.frame_subsampling_factor / 2' is to allow for the # fact that we'll be shifting the data slightly during training to give # variety to the training data. egs_left_context_initial = (left_context_initial + - args.frame_subsampling_factor / 2 if + args.frame_subsampling_factor // 2 if left_context_initial >= 0 else -1) egs_right_context_final = (right_context_final + - args.frame_subsampling_factor / 2 if + args.frame_subsampling_factor // 2 if right_context_final >= 0 else -1) default_egs_dir = '{0}/egs'.format(args.dir) if ((args.stage <= -3) and args.egs_dir is None): - logger.info("Generating egs using {0}".format("steps/nnet3/chain/get_egs_ts.sh")) + logger.info("Generating egs using get_egs_ts.sh") if (not os.path.exists("{0}/den.fst".format(args.dir)) or not os.path.exists("{0}/normalization.fst".format(args.dir)) or not os.path.exists("{0}/tree".format(args.dir))): @@ -428,7 +441,8 @@ def train(args, run_opts): right_context_final=egs_right_context_final, run_opts=run_opts, frame_subsampling_factor=args.frame_subsampling_factor, - frames_per_eg_str=args.chunk_width, + frames_per_eg_str=(args.chunk_width if args.chunk_width is not None + else ""), srand=args.srand, egs_opts=args.egs_opts, cmvn_opts=args.cmvn_opts, @@ -449,7 +463,7 @@ def train(args, run_opts): egs_left_context, egs_right_context, egs_left_context_initial, egs_right_context_final)) - assert(args.chunk_width == frames_per_eg_str) + assert(args.chunk_width is None or args.chunk_width == frames_per_eg_str) num_archives_expanded = num_archives * args.frame_subsampling_factor if (args.num_jobs_final > num_archives_expanded): @@ -461,9 +475,9 @@ def train(args, run_opts): logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir)) common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) - if not os.path.exists('{0}/valid_diagnostic.egs'.format(egs_dir)): + if not os.path.exists('{0}/valid_diagnostic.cegs'.format(egs_dir)): if (not os.path.exists('{0}/valid_diagnostic.scp'.format(egs_dir))): - raise Exception('neither {0}/valid_diagnostic.egs nor ' + raise Exception('neither 
{0}/valid_diagnostic.cegs nor ' '{0}/valid_diagnostic.scp exist.' 'This script expects one of them.'.format(egs_dir)) use_multitask_egs = True @@ -495,7 +509,7 @@ def train(args, run_opts): num_archives_to_process = int(args.num_epochs * num_archives_expanded) num_archives_processed = 0 num_iters = ((num_archives_to_process * 2) - / (args.num_jobs_initial + args.num_jobs_final)) + // (args.num_jobs_initial + args.num_jobs_final)) # If do_final_combination is True, compute the set of models_to_combine. # Otherwise, models_to_combine will be none. @@ -550,18 +564,22 @@ def train(args, run_opts): xent_regularize = args.xent_regularize l2_regularize = args.l2_regularize - objective_opts = ("--objective-scales=" + args.objective_scales - if args.objective_scales is not None else "") - smbr_factor = 0.0 + objective_opts = "" + + use_smbr_objective = False if args.smbr_factor_schedule is not None: - smbr_factor = common_train_lib.get_schedule_value( + smbr_factors = common_train_lib.get_schedule_string( args.smbr_factor_schedule, float(num_archives_processed) / num_archives_to_process) - objective_opts += " --smbr-factor={0}".format(smbr_factor) + objective_opts += " --smbr-factors='{0}'".format(smbr_factors) + for factor in smbr_factors.split(): + parts = factor.split(":") + if parts[1] > 0.0: + use_smbr_objective = True + break - if smbr_factor > 0.0: - use_smbr=True + if use_smbr_objective: xent_regularize = (args.smbr_xent_regularize if args.smbr_xent_regularize is not None else args.xent_regularize) @@ -575,11 +593,29 @@ def train(args, run_opts): objective_opts += " " + args.smbr_extra_opts if args.mmi_factor_schedule is not None: - mmi_factor = common_train_lib.get_schedule_value( + mmi_factors = common_train_lib.get_schedule_string( args.mmi_factor_schedule, float(num_archives_processed) / num_archives_to_process) - objective_opts += " --mmi-factor={0}".format(mmi_factor) + objective_opts += " --mmi-factors='{0}'".format(mmi_factors) + else: + objective_opts += " --mmi-factors='output:0'" + + if args.ml_factor_schedule is not None: + ml_factors = common_train_lib.get_schedule_string( + args.ml_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --ml-factors='{0}'".format(ml_factors) + + if args.kl_factor_schedule is not None: + kl_factors = common_train_lib.get_schedule_string( + args.kl_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --kl-factors='{0}'".format(kl_factors) + else: + objective_opts += " --kl-factors='output:1'" objective_opts += " --norm-regularize={0}".format( "true" if args.norm_regularize else "false") @@ -597,6 +633,11 @@ def train(args, run_opts): percent, lrate, shrink_info_str)) + objective_opts += " --leaky-hmm-coefficient={0} {1}".format( + args.leaky_hmm_coefficient, + "" if args.smbr_leaky_hmm_coefficient is None else + "--smbr-leaky-hmm-coefficient={}".format(args.smbr_leaky_hmm_coefficient)) + chain_lib.train_one_iteration( dir=args.dir, iter=iter, @@ -610,6 +651,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), shrinkage_value=shrinkage_value, num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, apply_deriv_weights=args.apply_deriv_weights, @@ -617,9 +659,6 @@ def train(args, run_opts): max_deriv_time_relative=max_deriv_time_relative, l2_regularize=l2_regularize, xent_regularize=xent_regularize, - 
leaky_hmm_coefficient=(args.smbr_leaky_hmm_coefficient - if smbr_factor > 0.0 - else args.leaky_hmm_coefficient), momentum=args.momentum, max_param_change=args.max_param_change, shuffle_buffer_size=args.shuffle_buffer_size, @@ -632,7 +671,7 @@ def train(args, run_opts): objective_opts=objective_opts) if args.cleanup: - # do a clean up everythin but the last 2 models, under certain + # do a clean up everything but the last 2 models, under certain # conditions common_train_lib.remove_model( args.dir, iter-2, num_iters, models_to_combine, @@ -657,30 +696,62 @@ def train(args, run_opts): l2_regularize = args.l2_regularize objective_opts = ("--objective-scales=" + args.objective_scales if args.objective_scales is not None else "") - smbr_factor = 0.0 - if args.smbr_factor_schedule is not None: - smbr_factor = common_train_lib.get_schedule_value( - args.smbr_factor_schedule, 1.0) - objective_opts += " --smbr-factor={0}".format(smbr_factor) - - if smbr_factor > 0.0: - use_smbr=True + use_smbr_objective = False + if args.smbr_factor_schedule is not None: + smbr_factors = common_train_lib.get_schedule_string( + args.smbr_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --smbr-factors='{0}'".format(smbr_factors) + for factor in smbr_factors.split(): + parts = factor.split(":") + if parts[1] > 0.0: + use_smbr_objective = True + break + + if use_smbr_objective: xent_regularize = (args.smbr_xent_regularize if args.smbr_xent_regularize is not None else args.xent_regularize) l2_regularize = (args.smbr_l2_regularize if args.smbr_l2_regularize is not None else args.l2_regularize) - objective_opts = "--use-smbr-objective" + objective_opts += " --use-smbr-objective" if silence_pdfs is not None: objective_opts += " --silence-pdfs=" + silence_pdfs + if args.smbr_extra_opts is not None: + objective_opts += " " + args.smbr_extra_opts if args.mmi_factor_schedule is not None: - mmi_factor = common_train_lib.get_schedule_value( - args.mmi_factor_schedule, 1.0) + mmi_factors = common_train_lib.get_schedule_string( + args.mmi_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --mmi-factors='{0}'".format(mmi_factors) + + if args.ml_factor_schedule is not None: + ml_factors = common_train_lib.get_schedule_string( + args.ml_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --ml-factors='{0}'".format(ml_factors) + + if args.kl_factor_schedule is not None: + kl_factors = common_train_lib.get_schedule_string( + args.kl_factor_schedule, + float(num_archives_processed) / num_archives_to_process) + + objective_opts += " --kl-factors='{0}'".format(kl_factors) + + + objective_opts += " --norm-regularize={0}".format( + "true" if args.norm_regularize else "false") - objective_opts += " --mmi-factor={0}".format(mmi_factor) + objective_opts += " --leaky-hmm-coefficient={0} {1}".format( + args.leaky_hmm_coefficient, + "" if args.smbr_leaky_hmm_coefficient is None else + "--smbr-leaky-hmm-coefficient={}".format(args.smbr_leaky_hmm_coefficient)) if args.do_final_combination: logger.info("Doing final combination to produce final.mdl") @@ -690,7 +761,6 @@ def train(args, run_opts): models_to_combine=models_to_combine, num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, egs_dir=egs_dir, - leaky_hmm_coefficient=args.leaky_hmm_coefficient, l2_regularize=l2_regularize, xent_regularize=xent_regularize, run_opts=run_opts, @@ -704,7 +774,6 @@ def train(args, run_opts): 
chain_lib.compute_train_cv_probabilities( dir=args.dir, iter=num_iters, egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, - leaky_hmm_coefficient=args.leaky_hmm_coefficient, run_opts=run_opts, use_multitask_egs=use_multitask_egs, objective_opts=objective_opts) @@ -722,8 +791,9 @@ def train(args, run_opts): # delete it remove_egs = False + # leave the last-two-numbered models, for diagnostic reasons. common_train_lib.clean_nnet_dir( - args.dir, num_iters, egs_dir, + args.dir, num_iters - 1, egs_dir, preserve_model_interval=args.preserve_model_interval, remove_egs=remove_egs) @@ -737,7 +807,7 @@ def train(args, run_opts): with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: f.write(report) - common_lib.execute_command("steps/info/nnet3_dir_info.pl " + common_lib.execute_command("steps/info/chain_dir_info.pl " "{0}".format(args.dir)) diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 192aef992f0..8a9fe62465f 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -946,8 +946,12 @@ void AppendSupervisionPost(const std::vector &input, } AppendGeneralMatrixRows( - output_targets, &((*output_supervision)[0].numerator_post_targets), + output_targets, &((*output_supervision)[0].numerator_post_targets), true); // sort by t + KALDI_ASSERT((*output_supervision)[0].numerator_post_targets.NumRows() + == (*output_supervision)[0].frames_per_sequence + * (*output_supervision)[0].num_sequences); + KALDI_ASSERT((*output_supervision)[0].frames_per_sequence * (*output_supervision)[0].num_sequences == (*output_supervision)[0].numerator_post_targets.NumRows()); } void AppendSupervision(const std::vector &input, @@ -992,20 +996,26 @@ void AppendSupervision(const std::vector &input, } } + KALDI_ASSERT(output_supervision->size() == 1); // otherwise not supported KALDI_ASSERT(output_was_merged.size() == output_supervision->size()); for (size_t i = 0; i < output_supervision->size(); i++) { if (output_was_merged[i]) { fst::StdVectorFst &out_fst = (*output_supervision)[i].fst; // The process of concatenation will have introduced epsilons. 
fst::RmEpsilon(&out_fst); - if (input[0]->numerator_post_targets.NumRows() > 0 && out_fst.Start() < 0) - return; - SortBreadthFirstSearch(&out_fst); + if (input[0]->numerator_post_targets.NumRows() > 0 && out_fst.Start() >= 0) + SortBreadthFirstSearch(&out_fst); } } if (input[0]->numerator_post_targets.NumRows() > 0) { + KALDI_LOG << "Appending numerator post "; AppendSupervisionPost(input, output_supervision); + KALDI_LOG << (*output_supervision)[0].frames_per_sequence << " * " + << (*output_supervision)[0].num_sequences << " == " + << (*output_supervision)[0].numerator_post_targets.NumRows(); + + KALDI_ASSERT((*output_supervision)[0].frames_per_sequence * (*output_supervision)[0].num_sequences == (*output_supervision)[0].numerator_post_targets.NumRows()); } } diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 67ce91abe1d..92fbaf7b3a7 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -262,9 +262,10 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, supervision.num_sequences, nnet_output); - den_logprob_weighted = supervision.weight * denominator.Forward(); + den_logprob_weighted = supervision.weight * + (opts.mmi_factor + opts.kl_factor) * denominator.Forward(); if (nnet_output_deriv) - ok = denominator.Backward(-supervision.weight, + ok = denominator.Backward(-supervision.weight * (opts.mmi_factor + opts.kl_factor), nnet_output_deriv); } @@ -278,8 +279,20 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, kSetZero, kStrideEqualNumCols); } + if (opts.kl_factor > 0.0) { + if (xent_output_deriv) { + supervision.numerator_post_targets.CopyToMat(xent_output_deriv); + xent_output_deriv->Scale(supervision.weight * opts.kl_factor); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(1.0, *xent_output_deriv); + } else if (nnet_output_deriv) { + CuMatrix numerator_post(nnet_output.NumRows(), nnet_output.NumCols()); + supervision.numerator_post_targets.CopyToMat(&numerator_post); + nnet_output_deriv->AddMat(supervision.weight * opts.kl_factor, numerator_post); + } + } - { + if (opts.mmi_factor > 0.0) { NumeratorComputation numerator(supervision, nnet_output); // note: supervision.weight is included as a factor in the derivative from // the numerator object, as well as the returned logprob. 
diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 0fea8243269..8367b41bf3d 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -68,6 +68,7 @@ struct ChainTrainingOptions { BaseFloat mmi_factor; BaseFloat ml_factor; + BaseFloat kl_factor; BaseFloat smbr_factor; BaseFloat smbr_threshold; @@ -75,12 +76,12 @@ struct ChainTrainingOptions { BaseFloat smbr_leaky_hmm_coefficient; - std::string smbr_factors_str, mmi_factors_str, ml_factors_str; + std::string smbr_factors_str, mmi_factors_str, ml_factors_str, kl_factors_str; ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), xent_regularize(0.0), use_smbr_objective(false), exclude_silence(false), one_silence_class(false), - mmi_factor(1.0), ml_factor(0.0), + mmi_factor(1.0), ml_factor(0.0), kl_factor(0.0), smbr_factor(0.0), smbr_threshold(0.0), norm_regularize(false), smbr_leaky_hmm_coefficient(-1) { } @@ -135,6 +136,8 @@ struct ChainTrainingOptions { "MMI factors for each output"); opts->Register("ml-factors", &ml_factors_str, "ML factors for each output"); + opts->Register("kl-factors", &kl_factors_str, + "KL factors for each output"); opts->Register("smbr-leaky-hmm-coefficient", &smbr_leaky_hmm_coefficient, "leaky-hmm-coefficient for LF-sMBR training. If not " "provided, will use --leaky-hmm-coefficient instead."); diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index ec02b445f84..3cb1f317e1b 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -96,6 +96,9 @@ NnetChainComputeProb::NnetChainComputeProb( if (!chain_config.ml_factors_str.empty()) ParseObjectiveScales(chain_config.ml_factors_str, &ml_factors_); + if (!chain_config.kl_factors_str.empty()) + ParseObjectiveScales(chain_config.kl_factors_str, + &kl_factors_); } @@ -220,6 +223,11 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, if (it != ml_factors_.end()) chain_config_copy.ml_factor = it->second; } + { + auto it = kl_factors_.find(sup.name); + if (it != kl_factors_.end()) + chain_config_copy.kl_factor = it->second; + } bool use_xent = (chain_config_copy.xent_regularize != 0.0); std::string xent_name = sup.name + "-xent"; // typically "output-xent". @@ -234,28 +242,24 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, BaseFloat tot_like, tot_mmi_objf, tot_l2_term, tot_weight; - if (sup.supervision.numerator_post_targets.NumRows() > 0) { - ComputeKLObjfAndDeriv(chain_config_copy, den_graph_, - sup.supervision, nnet_output, - &tot_like, &tot_l2_term, &tot_weight, - (nnet_config_.compute_deriv ? &nnet_output_deriv : - NULL), (use_xent ? &xent_deriv : NULL)); - } else { - if (chain_config_copy.use_smbr_objective) - ComputeChainSmbrObjfAndDeriv( - chain_config_copy, den_graph_, - sup.supervision, nnet_output, - &tot_like, &tot_mmi_objf, &tot_l2_term, &tot_weight, - (nnet_config_.compute_deriv ? &nnet_output_deriv : - NULL), (use_xent ? &xent_deriv : NULL), - sil_indices_.Dim() ? &sil_indices_ : NULL); - else - ComputeChainObjfAndDeriv(chain_config_copy, den_graph_, - sup.supervision, nnet_output, - &tot_like, &tot_l2_term, &tot_weight, - (nnet_config_.compute_deriv ? &nnet_output_deriv : - NULL), (use_xent ? 
&xent_deriv : NULL)); - } + if (chain_config_copy.kl_factor > 0.0) + KALDI_ASSERT(sup.supervision.numerator_post_targets.NumRows() > 0 + && chain_config_copy.smbr_factor == 0.0); + + if (chain_config_copy.smbr_factor > 0.0) + ComputeChainSmbrObjfAndDeriv( + chain_config_copy, den_graph_, + sup.supervision, nnet_output, + &tot_like, &tot_mmi_objf, &tot_l2_term, &tot_weight, + (nnet_config_.compute_deriv ? &nnet_output_deriv : + NULL), (use_xent ? &xent_deriv : NULL), + sil_indices_.Dim() ? &sil_indices_ : NULL); + else + ComputeChainObjfAndDeriv(chain_config_copy, den_graph_, + sup.supervision, nnet_output, + &tot_like, &tot_l2_term, &tot_weight, + (nnet_config_.compute_deriv ? &nnet_output_deriv : + NULL), (use_xent ? &xent_deriv : NULL)); // note: in this context we don't want to apply 'sup.deriv_weights' because // this code is used only in combination, where it's part of an L-BFGS diff --git a/src/nnet3/nnet-chain-diagnostics.h b/src/nnet3/nnet-chain-diagnostics.h index 671fd731832..916fc8ab981 100644 --- a/src/nnet3/nnet-chain-diagnostics.h +++ b/src/nnet3/nnet-chain-diagnostics.h @@ -125,6 +125,7 @@ class NnetChainComputeProb { unordered_map smbr_factors_; unordered_map mmi_factors_; unordered_map ml_factors_; + unordered_map kl_factors_; }; /// This function zeros the stored component-level stats in the nnet using diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index c4e7f900c3b..2d3e87627ee 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -87,6 +87,8 @@ void NnetChainSupervision::CheckDim() const { KALDI_ASSERT(deriv_weights.Dim() == indexes.size()); KALDI_ASSERT(deriv_weights.Min() >= 0.0); } + if (supervision.numerator_post_targets.NumRows() > 0) + KALDI_ASSERT(indexes.size() == supervision.numerator_post_targets.NumRows()); } NnetChainSupervision::NnetChainSupervision(const NnetChainSupervision &other): @@ -211,11 +213,18 @@ static void MergeSupervision( AppendSupervision(input_supervision, compactify, &output_supervision); + + if (output_supervision[0].numerator_post_targets.NumRows() > 0) + KALDI_ASSERT(output_supervision[0].frames_per_sequence * output_supervision[0].num_sequences == output_supervision[0].numerator_post_targets.NumRows()); + if (output_supervision.size() != 1) KALDI_ERR << "Failed to merge 'chain' examples-- inconsistent lengths " << "or weights?"; output->supervision.Swap(&(output_supervision[0])); + if (output->supervision.numerator_post_targets.NumRows() > 0) + KALDI_ASSERT(output->supervision.frames_per_sequence * output->supervision.num_sequences == output->supervision.numerator_post_targets.NumRows()); + output->indexes.clear(); output->indexes.reserve(num_indexes); for (int32 n = 0; n < num_inputs; n++) { diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 5f8a2e08691..a88c1fb5132 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -109,6 +109,9 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, if (!opts.chain_config.ml_factors_str.empty()) ParseObjectiveScales(opts.chain_config.ml_factors_str, &ml_factors_); + if (!opts.chain_config.kl_factors_str.empty()) + ParseObjectiveScales(opts.chain_config.kl_factors_str, + &kl_factors_); } void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { @@ -616,6 +619,11 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, if (it != ml_factors_.end()) chain_config.ml_factor = it->second; } + { + auto it = kl_factors_.find(sup.name); + 
if (it != kl_factors_.end()) + chain_config.kl_factor = it->second; + } bool use_xent = (chain_config.xent_regularize != 0.0); std::string xent_name = sup.name + "-xent"; // typically "output-xent". @@ -623,29 +631,26 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, BaseFloat tot_objf, tot_mmi_objf, tot_l2_term, tot_weight; - if (sup.supervision.numerator_post_targets.NumRows() > 0) { - ComputeKLObjfAndDeriv(opts_.chain_config, den_graph_, - sup.supervision, nnet_output, - &tot_objf, &tot_l2_term, &tot_weight, - &nnet_output_deriv, - (use_xent ? &xent_deriv : NULL)); + if (chain_config.kl_factor > 0.0) + KALDI_ASSERT(sup.supervision.numerator_post_targets.NumRows() > 0 + && chain_config.smbr_factor == 0.0); + + if (chain_config.smbr_factor > 0.0) { + ComputeChainSmbrObjfAndDeriv(chain_config, den_graph_, + sup.supervision, nnet_output, + &tot_objf, &tot_mmi_objf, + &tot_l2_term, &tot_weight, + &nnet_output_deriv, + (use_xent ? &xent_deriv : NULL), + sil_indices_.Dim() ? &sil_indices_ : NULL); } else { - if (chain_config.use_smbr_objective) { - ComputeChainSmbrObjfAndDeriv(chain_config, den_graph_, - sup.supervision, nnet_output, - &tot_objf, &tot_mmi_objf, - &tot_l2_term, &tot_weight, - &nnet_output_deriv, - (use_xent ? &xent_deriv : NULL), - sil_indices_.Dim() ? &sil_indices_ : NULL); - } else { - ComputeChainObjfAndDeriv(chain_config, den_graph_, - sup.supervision, nnet_output, - &tot_objf, &tot_l2_term, &tot_weight, - &nnet_output_deriv, - (use_xent ? &xent_deriv : NULL)); - } + ComputeChainObjfAndDeriv(chain_config, den_graph_, + sup.supervision, nnet_output, + &tot_objf, &tot_l2_term, &tot_weight, + &nnet_output_deriv, + (use_xent ? &xent_deriv : NULL)); } + if (use_xent) { // this block computes the cross-entropy objective. const CuMatrixBase &xent_output = computer->GetOutput( diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h index 48d36069e1d..63febb6d719 100644 --- a/src/nnet3/nnet-chain-training.h +++ b/src/nnet3/nnet-chain-training.h @@ -130,6 +130,7 @@ class NnetChainTrainer { unordered_map smbr_factors_; unordered_map mmi_factors_; unordered_map ml_factors_; + unordered_map kl_factors_; }; From bff2dc8f0e8cb9962bc767c171dd3c2a901db363 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sat, 31 Mar 2018 19:40:09 -0400 Subject: [PATCH 140/174] TS changes --- .../multi_condition/run_ivector_common.sh | 9 +- egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh | 5 + egs/wsj/s5/steps/nnet3/chain/train_ts.py | 11 ++ src/chain/chain-numerator.cc | 4 +- src/chain/chain-numerator.h | 2 +- src/chain/chain-supervision-splitter-test.cc | 2 +- src/chain/chain-supervision-splitter.cc | 8 +- src/chain/chain-supervision-splitter.h | 6 +- src/chain/chain-supervision.cc | 21 ++- src/chain/chain-supervision.h | 5 +- src/chain/chain-training.cc | 136 ++++++++--------- src/chain/chain-training.h | 20 +-- src/chainbin/nnet3-chain-split-and-get-egs.cc | 136 ++++++++++++++++- src/nnet3/nnet-chain-diagnostics.cc | 143 ++++++++---------- src/nnet3/nnet-chain-diagnostics.h | 1 + src/nnet3/nnet-chain-training.cc | 143 +++--------------- 16 files changed, 344 insertions(+), 308 deletions(-) diff --git a/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh b/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh index 2b16ad70e89..b67b614c0cf 100755 --- a/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh +++ b/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh @@ -18,6 +18,9 @@ rvb_affix=_rvb nnet3_affix=_cleaned # affix for 
exp/$mic/nnet3 directory to put iVector stuff in, so it # becomes exp/$mic/nnet3_cleaned or whatever. num_data_reps=1 +sample_rate=16000 + +max_jobs_run=10 . ./cmd.sh . ./path.sh @@ -65,7 +68,7 @@ if [ $stage -le 1 ]; then for datadir in ${train_set}_sp dev eval; do steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/$mic/${datadir}_hires + --cmd "$train_cmd --max-jobs-run $max_jobs_run" data/$mic/${datadir}_hires steps/compute_cmvn_stats.sh data/$mic/${datadir}_hires utils/fix_data_dir.sh data/$mic/${datadir}_hires done @@ -102,14 +105,14 @@ if [ $stage -le 3 ]; then --isotropic-noise-addition-probability 1 \ --num-replications ${num_data_reps} \ --max-noises-per-minute 1 \ - --source-sampling-rate 16000 \ + --source-sampling-rate $sample_rate \ ${norvb_datadir} ${norvb_datadir}_rvb${num_data_reps} utils/copy_data_dir.sh ${norvb_datadir}_rvb${num_data_reps} ${norvb_datadir}_rvb${num_data_reps}_hires utils/data/perturb_data_dir_volume.sh ${norvb_datadir}_rvb${num_data_reps}_hires steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" ${norvb_datadir}_rvb${num_data_reps}_hires + --cmd "$train_cmd --max-jobs-run $max_jobs_run" ${norvb_datadir}_rvb${num_data_reps}_hires steps/compute_cmvn_stats.sh ${norvb_datadir}_rvb${num_data_reps}_hires utils/fix_data_dir.sh ${norvb_datadir}_rvb${num_data_reps}_hires fi diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh index d9db66c3153..b1a9acb82de 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh @@ -77,6 +77,7 @@ acwt=0.1 # For pruning phone_insertion_penalty= deriv_weights_scp= generate_egs_scp=false +include_numerator_post=true echo "$0 $@" # Print the command line for logging @@ -286,6 +287,10 @@ chain_supervision_all_opts="--supervision.frame-subsampling-factor=$alignment_su [ ! -z $left_tolerance ] && \ chain_supervision_all_opts="$chain_supervision_all_opts --supervision.left-tolerance=$left_tolerance" +if $include_numerator_post; then + chain_supervision_all_opts="$chain_supervision_all_opts --include-numerator-post" +fi + normalization_scale=1.0 lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" diff --git a/egs/wsj/s5/steps/nnet3/chain/train_ts.py b/egs/wsj/s5/steps/nnet3/chain/train_ts.py index 4ca3f314154..d9419818534 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_ts.py +++ b/egs/wsj/s5/steps/nnet3/chain/train_ts.py @@ -199,6 +199,13 @@ def get_args(): input data. E.g. 8 is a reasonable setting. 
Note: the 'required' part of the chunk is defined by the model's {left,right}-context.""") + parser.add_argument("--trainer.optimization.do-final-combination", + dest='do_final_combination', type=str, + action=common_lib.StrToBoolAction, + choices=["true", "false"], default=False, + help="""Set this to false to disable the final + 'combine' stage (in this case we just use the + last-numbered model as the final.mdl).""") parser.add_argument("--lang", type=str, help="Lang directory to get silence pdfs.") @@ -729,6 +736,8 @@ def train(args, run_opts): float(num_archives_processed) / num_archives_to_process) objective_opts += " --mmi-factors='{0}'".format(mmi_factors) + else: + objective_opts += " --mmi-factors='output:0'" if args.ml_factor_schedule is not None: ml_factors = common_train_lib.get_schedule_string( @@ -743,6 +752,8 @@ def train(args, run_opts): float(num_archives_processed) / num_archives_to_process) objective_opts += " --kl-factors='{0}'".format(kl_factors) + else: + objective_opts += " --kl-factors='output:1'" objective_opts += " --norm-regularize={0}".format( diff --git a/src/chain/chain-numerator.cc b/src/chain/chain-numerator.cc index 139d28bdd77..973ceb352a6 100644 --- a/src/chain/chain-numerator.cc +++ b/src/chain/chain-numerator.cc @@ -148,6 +148,7 @@ BaseFloat NumeratorComputation::Forward() { void NumeratorComputation::Backward( + BaseFloat weight, CuMatrixBase *nnet_output_deriv) { const fst::StdVectorFst &fst = supervision_.fst; int32 num_states = fst.NumStates(); @@ -204,7 +205,8 @@ void NumeratorComputation::Backward( // copy this data to GPU. CuVector nnet_logprob_deriv_cuda; nnet_logprob_deriv_cuda.Swap(&nnet_logprob_derivs_); - nnet_output_deriv->AddElements(supervision_.weight, nnet_output_indexes_, + nnet_output_deriv->AddElements(supervision_.weight * weight, + nnet_output_indexes_, nnet_logprob_deriv_cuda.Data()); } diff --git a/src/chain/chain-numerator.h b/src/chain/chain-numerator.h index 15cb31e0571..691d9d72085 100644 --- a/src/chain/chain-numerator.h +++ b/src/chain/chain-numerator.h @@ -78,7 +78,7 @@ class NumeratorComputation { // Does the backward computation and (efficiently) adds the derivative of the // nnet output w.r.t. the (log-prob times supervision_.weight times // deriv_weight) to 'nnet_output_deriv'. - void Backward(CuMatrixBase *nnet_output_deriv); + void Backward(BaseFloat weight, CuMatrixBase *nnet_output_deriv); private: diff --git a/src/chain/chain-supervision-splitter-test.cc b/src/chain/chain-supervision-splitter-test.cc index ccc6cdabbfd..29bd5459a1b 100644 --- a/src/chain/chain-supervision-splitter-test.cc +++ b/src/chain/chain-supervision-splitter-test.cc @@ -273,7 +273,7 @@ int main(int argc, char *argv[]) { const char *usage = "chain-supervision-test [options]"; ParseOptions po(usage); - + int32 num_phones = 1; po.Register("num-phones", &num_phones, diff --git a/src/chain/chain-supervision-splitter.cc b/src/chain/chain-supervision-splitter.cc index d5c7c118c97..2f3ad9b4420 100644 --- a/src/chain/chain-supervision-splitter.cc +++ b/src/chain/chain-supervision-splitter.cc @@ -94,7 +94,7 @@ void SupervisionLatticeSplitter::LoadLattice(const Lattice &lat) { bool SupervisionLatticeSplitter::GetFrameRangeSupervision( int32 begin_frame, int32 num_frames, Supervision *supervision, - Lattice *out_lat) const { + Lattice *out_lat, Lattice *raw_range_lat) const { int32 end_frame = begin_frame + num_frames; // Note: end_frame is not included in the range of frames that the // output supervision object covers; it's one past the end. 
@@ -103,7 +103,11 @@ bool SupervisionLatticeSplitter::GetFrameRangeSupervision( Lattice lat_out; CreateRangeLattice(begin_frame, end_frame, &lat_out); - + + if (raw_range_lat) { + *raw_range_lat = lat_out; + } + PostProcessLattice(&lat_out); if (out_lat) { diff --git a/src/chain/chain-supervision-splitter.h b/src/chain/chain-supervision-splitter.h index bf73a2c8833..e0d2397466e 100644 --- a/src/chain/chain-supervision-splitter.h +++ b/src/chain/chain-supervision-splitter.h @@ -95,7 +95,8 @@ class SupervisionLatticeSplitter { bool GetFrameRangeSupervision(int32 begin_frame, int32 frames_per_sequence, chain::Supervision *supervision, - Lattice *lat = NULL) const; + Lattice *lat = NULL, + Lattice *raw_range_lat = NULL) const; bool GetFrameRangeProtoSupervision( const ContextDependencyInterface &ctx_dep, @@ -125,6 +126,9 @@ class SupervisionLatticeSplitter { void Check() const; }; + const Lattice& GetLattice() const { return lat_; } + + const std::StdVectorFst& ToleranceFst() const { return tolerance_fst_; } private: // Creates an output lattice covering frames begin_frame <= t < end_frame, // assuming that the corresponding state-range that we need to diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 8a9fe62465f..a2ac419468b 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -751,6 +751,9 @@ void Supervision::Write(std::ostream &os, bool binary) const { if (numerator_post_targets.NumRows() > 0) { WriteToken(os, binary, ""); numerator_post_targets.Write(os, binary); + + WriteToken(os, binary, ""); + WriteBasicType(os, binary, numerator_log_prob); } if (binary == false) { // In text mode, write the FST without any compactification. @@ -793,6 +796,7 @@ void Supervision::Swap(Supervision *other) { std::swap(e2e, other->e2e); std::swap(e2e_fsts, other->e2e_fsts); std::swap(numerator_post_targets, other->numerator_post_targets); + std::swap(numerator_log_prob, other->numerator_log_prob); } void Supervision::Read(std::istream &is, bool binary) { @@ -816,6 +820,11 @@ void Supervision::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); numerator_post_targets.Read(is, binary); + if (PeekToken(is, binary) == 'N') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &numerator_log_prob); + } + if (PeekToken(is, binary) == '/') { ExpectToken(is, binary, ""); return; @@ -888,9 +897,9 @@ int32 ComputeFstStateTimes(const fst::StdVectorFst &fst, return total_length; } -Supervision::Supervision(int32 dim, const Posterior &labels): +Supervision::Supervision(int32 dim, const Posterior &labels, BaseFloat log_prob): weight(1.0), num_sequences(1), frames_per_sequence(labels.size()), - label_dim(dim), e2e(false) { + label_dim(dim), e2e(false), numerator_log_prob(log_prob) { SparseMatrix sparse_feats(dim, labels); numerator_post_targets = sparse_feats; } @@ -900,7 +909,8 @@ Supervision::Supervision(const Supervision &other): frames_per_sequence(other.frames_per_sequence), label_dim(other.label_dim), fst(other.fst), e2e(other.e2e), e2e_fsts(other.e2e_fsts), - numerator_post_targets(other.numerator_post_targets) { } + numerator_post_targets(other.numerator_post_targets), + numerator_log_prob(other.numerator_log_prob) { } // This static function is called by AppendSupervision if the supervisions @@ -937,12 +947,17 @@ void AppendSupervisionPost(const std::vector &input, std::vector output_targets(num_inputs); output_targets[0] = &(input[0]->numerator_post_targets); + KALDI_ASSERT(kaldi::ApproxEqual( + 
(*output_supervision)[0].numerator_log_prob, + input[0]->numerator_log_prob)); + for (int32 i = 1; i < num_inputs; i++) { output_targets[i] = &(input[i]->numerator_post_targets); KALDI_ASSERT(output_targets[i]->NumRows() > 0); KALDI_ASSERT(output_targets[i]->NumCols() == label_dim); KALDI_ASSERT(input[i]->frames_per_sequence == (*output_supervision)[0].frames_per_sequence); + (*output_supervision)[0].numerator_log_prob += input[i]->numerator_log_prob; } AppendGeneralMatrixRows( diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index 09b06012d09..e7d4ba41de2 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -277,11 +277,12 @@ struct Supervision { std::vector e2e_fsts; GeneralMatrix numerator_post_targets; + BaseFloat numerator_log_prob; Supervision(): weight(1.0), num_sequences(1), frames_per_sequence(-1), - label_dim(-1), e2e(false) { } + label_dim(-1), e2e(false), numerator_log_prob(0.0) { } - Supervision(int32 dim, const Posterior &labels); + Supervision(int32 dim, const Posterior &labels, BaseFloat log_prob = 0.0); Supervision(const Supervision &other); diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 92fbaf7b3a7..59a57e588bb 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -141,57 +141,46 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, } } -void ComputeKLObjfAndDeriv(const ChainTrainingOptions &opts, - const DenominatorGraph &den_graph, - const Supervision &supervision, - const CuMatrixBase &nnet_output, - BaseFloat *objf, - BaseFloat *l2_term, - BaseFloat *weight, - CuMatrixBase *nnet_output_deriv, - CuMatrix *xent_output_deriv) { - KALDI_ASSERT(supervision.numerator_post_targets.NumRows() > 0); - KALDI_ASSERT(nnet_output.NumRows() == supervision.num_sequences * supervision.frames_per_sequence); - KALDI_ASSERT(supervision.numerator_post_targets.NumRows() == nnet_output.NumRows()); - - BaseFloat den_logprob_weighted; +void ComputeKLNumeratorObjfAndDeriv(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const CuMatrixBase &nnet_output, + BaseFloat supervision_weight, int32 num_sequences, + BaseFloat *objf, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv) { + CuMatrix deriv; + + if (nnet_output_deriv) + KALDI_ASSERT(nnet_output.NumRows() == nnet_output_deriv->NumRows() + && nnet_output.NumCols() == nnet_output_deriv->NumCols()); + + if (xent_output_deriv) { + KALDI_ASSERT(nnet_output.NumRows() == xent_output_deriv->NumRows() + && nnet_output.NumCols() == xent_output_deriv->NumCols()); + } + + if (xent_output_deriv != NULL || nnet_output_deriv != NULL) + deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols()); + + BaseFloat logprob_weighted; bool ok = true; - if (nnet_output_deriv != NULL) - nnet_output_deriv->SetZero(); - { // Doing the denominator first helps to reduce the maximum - // memory use, as we can set 'xent_deriv' to nonempty after - // we've freed the memory in this object. 
+ { DenominatorComputation denominator(opts, den_graph, - supervision.num_sequences, + num_sequences, nnet_output); - den_logprob_weighted = supervision.weight * denominator.Forward(); + logprob_weighted = supervision_weight * denominator.Forward(); if (nnet_output_deriv) - ok = denominator.Backward(-supervision.weight, - nnet_output_deriv); + ok = denominator.Backward(supervision_weight * opts.kl_factor, + &deriv); } - if (xent_output_deriv != NULL) { - // the reason for kStrideEqualNumCols is so that we can share the memory - // block with the memory that was used for exp_nnet_output_transposed_ from - // chain-denominator.cc, which has just been freed; it also uses the - // kStrideEqualNumCols arg (its shape is the transpose of this matrix's - // shape). - xent_output_deriv->Resize(nnet_output.NumRows(), nnet_output.NumCols(), - kSetZero, kStrideEqualNumCols); - supervision.numerator_post_targets.CopyToMat(xent_output_deriv); - xent_output_deriv->Scale(supervision.weight); - if (nnet_output_deriv) - nnet_output_deriv->AddMat(1.0, *xent_output_deriv); - } else if (nnet_output_deriv) { - CuMatrix numerator_post(nnet_output.NumRows(), nnet_output.NumCols()); - supervision.numerator_post_targets.CopyToMat(&numerator_post); - nnet_output_deriv->AddMat(supervision.weight, numerator_post); - } + int32 frames_per_sequence = nnet_output.NumRows() / num_sequences; - *objf = -den_logprob_weighted; - *weight = supervision.weight * supervision.num_sequences * supervision.frames_per_sequence; + *objf = logprob_weighted; + *weight = supervision_weight * num_sequences * frames_per_sequence; if (!((*objf) - (*objf) == 0) || !ok) { // inf or NaN detected, or denominator computation returned false. if (nnet_output_deriv) @@ -205,6 +194,19 @@ void ComputeKLObjfAndDeriv(const ChainTrainingOptions &opts, << ", setting objective function to " << default_objf << " per frame."; *objf = default_objf * *weight; + } else { + if (xent_output_deriv) { + // the reason for kStrideEqualNumCols is so that we can share the memory + // block with the memory that was used for exp_nnet_output_transposed_ from + // chain-denominator.cc, which has just been freed; it also uses the + // kStrideEqualNumCols arg (its shape is the transpose of this matrix's + // shape). 
+ xent_output_deriv->AddMat(1.0, deriv); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(1.0, deriv); + } else if (nnet_output_deriv) { + nnet_output_deriv->AddMat(1.0, deriv); + } } // This code helps us see how big the derivatives are, on average, @@ -214,23 +216,13 @@ void ComputeKLObjfAndDeriv(const ChainTrainingOptions &opts, if (GetVerboseLevel() >= 1 && nnet_output_deriv != NULL && RandInt(0, 10) == 0) { int32 tot_frames = nnet_output_deriv->NumRows(); CuVector row_products(tot_frames); - row_products.AddDiagMat2(1.0, *nnet_output_deriv, kNoTrans, 0.0); + row_products.AddDiagMat2(1.0, deriv, kNoTrans, 0.0); Vector row_products_cpu(row_products); - Vector row_products_per_frame(supervision.frames_per_sequence); + Vector row_products_per_frame(frames_per_sequence); for (int32 i = 0; i < tot_frames; i++) - row_products_per_frame(i / supervision.num_sequences) += row_products_cpu(i); + row_products_per_frame(i / num_sequences) += row_products_cpu(i); KALDI_LOG << "Derivs per frame are " << row_products_per_frame; } - - if (opts.l2_regularize == 0.0) { - *l2_term = 0.0; - } else { - // compute the l2 penalty term and its derivative - BaseFloat scale = supervision.weight * opts.l2_regularize; - *l2_term = -0.5 * scale * TraceMatMat(nnet_output, nnet_output, kTrans); - if (nnet_output_deriv) - nnet_output_deriv->AddMat(-1.0 * scale, nnet_output); - } } @@ -279,32 +271,34 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, kSetZero, kStrideEqualNumCols); } - if (opts.kl_factor > 0.0) { + if (opts.mmi_factor > 0.0) { + NumeratorComputation numerator(supervision, nnet_output); + // note: supervision.weight is included as a factor in the derivative from + // the numerator object, as well as the returned logprob. + num_logprob_weighted = opts.mmi_factor * numerator.Forward(); + if (xent_output_deriv) { - supervision.numerator_post_targets.CopyToMat(xent_output_deriv); - xent_output_deriv->Scale(supervision.weight * opts.kl_factor); + numerator.Backward(opts.mmi_factor, xent_output_deriv); if (nnet_output_deriv) nnet_output_deriv->AddMat(1.0, *xent_output_deriv); } else if (nnet_output_deriv) { - CuMatrix numerator_post(nnet_output.NumRows(), nnet_output.NumCols()); - supervision.numerator_post_targets.CopyToMat(&numerator_post); - nnet_output_deriv->AddMat(supervision.weight * opts.kl_factor, numerator_post); + numerator.Backward(opts.mmi_factor, nnet_output_deriv); } } - if (opts.mmi_factor > 0.0) { - NumeratorComputation numerator(supervision, nnet_output); - // note: supervision.weight is included as a factor in the derivative from - // the numerator object, as well as the returned logprob. 
- num_logprob_weighted = numerator.Forward(); - + if (opts.kl_factor > 0.0) { if (xent_output_deriv) { - numerator.Backward(xent_output_deriv); + CuMatrix numerator_post(nnet_output.NumRows(), nnet_output.NumCols()); + supervision.numerator_post_targets.CopyToMat(&numerator_post); + xent_output_deriv->AddMat(supervision.weight * opts.kl_factor, numerator_post); if (nnet_output_deriv) - nnet_output_deriv->AddMat(1.0, *xent_output_deriv); + nnet_output_deriv->AddMat(supervision.weight * opts.kl_factor, numerator_post); } else if (nnet_output_deriv) { - numerator.Backward(nnet_output_deriv); + CuMatrix numerator_post(nnet_output.NumRows(), nnet_output.NumCols()); + supervision.numerator_post_targets.CopyToMat(&numerator_post); + nnet_output_deriv->AddMat(supervision.weight * opts.kl_factor, numerator_post); } + num_logprob_weighted += opts.kl_factor * supervision.numerator_log_prob * supervision.weight; } *objf = num_logprob_weighted - den_logprob_weighted; @@ -386,7 +380,7 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, // note: supervision.weight is included as a factor in the derivative from // the numerator object, and the logprob too. num_logprob_weighted = (opts.mmi_factor + opts.ml_factor) * numerator.Forward(); - numerator.Backward(&numerator_post); + numerator.Backward(1.0, &numerator_post); #if HAVE_CUDA == 1 if (!CuDevice::Instantiate().Enabled()) #endif diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 8367b41bf3d..a5edf784795 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -72,6 +72,7 @@ struct ChainTrainingOptions { BaseFloat smbr_factor; BaseFloat smbr_threshold; + bool self_kl; bool norm_regularize; BaseFloat smbr_leaky_hmm_coefficient; @@ -82,7 +83,7 @@ struct ChainTrainingOptions { xent_regularize(0.0), use_smbr_objective(false), exclude_silence(false), one_silence_class(false), mmi_factor(1.0), ml_factor(0.0), kl_factor(0.0), - smbr_factor(0.0), smbr_threshold(0.0), + smbr_factor(0.0), smbr_threshold(0.0), self_kl(false), norm_regularize(false), smbr_leaky_hmm_coefficient(-1) { } @@ -234,15 +235,14 @@ void ComputeChainSmbrObjfAndDeriv( This function uses supervision as numerator and does denominator computation. It can be uses, where numerator is fixed e.g. TS learning. */ -void ComputeKLObjfAndDeriv(const ChainTrainingOptions &opts, - const DenominatorGraph &den_graph, - const Supervision &supervision, - const CuMatrixBase &nnet_output, - BaseFloat *objf, - BaseFloat *l2_term, - BaseFloat *weight, - CuMatrixBase *nnet_output_deriv, - CuMatrix *xent_output_deriv = NULL); +void ComputeKLNumeratorObjfAndDeriv(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const CuMatrixBase &nnet_output, + BaseFloat supervision_weight, int32 num_sequences, + BaseFloat *objf, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv = NULL); } // namespace chain } // namespace kaldi diff --git a/src/chainbin/nnet3-chain-split-and-get-egs.cc b/src/chainbin/nnet3-chain-split-and-get-egs.cc index 820ad29ac7d..af34639a0eb 100644 --- a/src/chainbin/nnet3-chain-split-and-get-egs.cc +++ b/src/chainbin/nnet3-chain-split-and-get-egs.cc @@ -32,6 +32,80 @@ namespace kaldi { namespace nnet3 { +/** This function converts lattice to FSA with weight equal to + sum of acoustic and language score, and pdf_id + 1 as labels. + This assumes that the acoustic and language scores are scaled appropriately. 
+*/ +void ConvertLatticeToPdfLabels( + const TransitionModel &tmodel, + const Lattice &ifst, + fst::StdVectorFst *ofst) { + typedef fst::ArcTpl ArcIn; + typedef fst::StdArc ArcOut; + typedef ArcIn::StateId StateId; + ofst->DeleteStates(); + // The states will be numbered exactly the same as the original FST. + // Add the states to the new FST. + StateId num_states = ifst.NumStates(); + for (StateId s = 0; s < num_states; s++) + ofst->AddState(); + ofst->SetStart(ifst.Start()); + for (StateId s = 0; s < num_states; s++) { + LatticeWeight final_iweight = ifst.Final(s); + if (final_iweight != LatticeWeight::Zero()) { + fst::TropicalWeight final_oweight; + ConvertLatticeWeight(final_iweight, &final_oweight); + ofst->SetFinal(s, final_oweight); + } + for (fst::ArcIterator iter(ifst, s); + !iter.Done(); + iter.Next()) { + const ArcIn &arc = iter.Value(); + KALDI_PARANOID_ASSERT(arc.weight != LatticeWeight::Zero()); + ArcOut oarc; + ConvertLatticeWeight(arc.weight, &oarc.weight); + if (arc.ilabel == 0) + oarc.ilabel = 0; // epsilon arc + else + oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; // pdf + 1 + oarc.olabel = oarc.ilabel; + oarc.nextstate = arc.nextstate; + ofst->AddArc(s, oarc); + } + } +} + +BaseFloat LatticeToNumeratorPost(const Lattice &lat, + const TransitionModel &trans_model, + const fst::StdVectorFst &normalization_fst, + BaseFloat lm_scale, std::string key, + Posterior *post) { + Lattice lat_copy(lat); + + if (normalization_fst.NumStates() > 0) + fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &lat_copy); + + fst::StdVectorFst sup_fst; + ConvertLatticeToPdfLabels(trans_model, lat_copy, &sup_fst); + + if (normalization_fst.NumStates() > 0 && + !chain::AddWeightToFst(normalization_fst, &sup_fst)) { + KALDI_WARN << "For utterance " << key << ", feature frames " + << ", FST was empty after composing with normalization FST. " + << "This should be extremely rare (a few per corpus, at most)"; + } + + // Convert fst to lattice to extract posterior using forward backward. 
+ ConvertFstToLattice(sup_fst, &lat_copy); + + kaldi::uint64 props = lat_copy.Properties(fst::kFstProperties, false); + if (!(props & fst::kTopSorted)) { + if (fst::TopSort(&lat_copy) == false) + KALDI_ERR << "Cycles detected in lattice."; + } + + return LatticeForwardBackward(lat_copy, post); +} /** This function does all the processing for one utterance, and outputs the @@ -47,8 +121,9 @@ static bool ProcessFile(const chain::SupervisionOptions &sup_opts, int32 ivector_period, const TransitionModel &trans_model, const chain::SupervisionLatticeSplitter &sup_lat_splitter, - const VectorBase *deriv_weights, - int32 supervision_length_tolerance, + const VectorBase *deriv_weights, + bool include_numerator_post, BaseFloat min_post, + int32 supervision_length_tolerance, const std::string &utt_id, bool compress, UtteranceSplitter *utt_splitter, @@ -92,11 +167,56 @@ static bool ProcessFile(const chain::SupervisionOptions &sup_opts, chain::Supervision supervision_part; + + Lattice *lat_part = NULL; + + if (include_numerator_post) + lat_part = new Lattice(); + if (!sup_lat_splitter.GetFrameRangeSupervision(start_frame_subsampled, num_frames_subsampled, - &supervision_part)) + &supervision_part, NULL, + lat_part)) return false; + if (include_numerator_post) { + Posterior pdf_post; + supervision_part.numerator_log_prob = LatticeToNumeratorPost( + *lat_part, trans_model, normalization_fst, + sup_opts.lm_scale, utt_id, &pdf_post); + KALDI_ASSERT(pdf_post.size() == num_frames_subsampled); + + Posterior check_post; + BaseFloat check_prob; + if (GetVerboseLevel() >= 2) { + check_prob = LatticeToNumeratorPost( + sup_lat_splitter.GetLattice(), trans_model, normalization_fst, + sup_opts.lm_scale, utt_id, &check_post); + } + KALDI_VLOG(2) << "log-prob=" << supervision_part.numerator_log_prob + << "; check-prob=" << check_prob; + + Posterior labels; + labels.resize(num_frames_subsampled); + for (int32 i = 0; i < num_frames_subsampled; i++) { + for (int32 j = 0; j < pdf_post[i].size(); j++) { + BaseFloat post = pdf_post[i][j].second; + KALDI_ASSERT(pdf_post[i][j].first > 0); + KALDI_VLOG(2) << pdf_post[i][j].first << " " << pdf_post[i][j].second + << "; " + << check_post[i + start_frame_subsampled][j].first + << check_post[i + start_frame_subsampled][j].second; + if (post > min_post) { + labels[i].push_back(std::make_pair( + pdf_post[i][j].first - 1, post)); // Convert from 1-index to 0-index + } + } + } + + SparseMatrix smat(trans_model.NumPdfs(), labels); + supervision_part.numerator_post_targets = smat; + } + if (normalization_fst.NumStates() > 0 && !chain::AddWeightToSupervisionFst(normalization_fst, &supervision_part)) { @@ -219,6 +339,9 @@ int main(int argc, char *argv[]) { int32 srand_seed = 0; std::string online_ivector_rspecifier, deriv_weights_rspecifier; + bool include_numerator_post = true; + BaseFloat min_post = 1e-8; + ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs with input features " "in compressed format (recommended). Update: this is now " @@ -242,6 +365,10 @@ int main(int argc, char *argv[]) { "whether a frame's gradient must be backpropagated or not. 
" "Not specifying this is equivalent to specifying a vector of " "all 1s."); + po.Register("include-numerator-post", &include_numerator_post, + "Include numerator posterior"); + po.Register("min-post", &min_post, "Minimum posterior to keep; this will " + "avoid dumping out all posteriors."); eg_config.Register(&po); @@ -374,7 +501,8 @@ int main(int argc, char *argv[]) { if (!ProcessFile(sup_opts, normalization_fst, feats, online_ivector_feats, online_ivector_period, trans_model, sup_lat_splitter, - deriv_weights, supervision_length_tolerance, + deriv_weights, include_numerator_post, min_post, + supervision_length_tolerance, key, compress, &utt_splitter, &example_writer)) num_err++; diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 3cb1f317e1b..e90da70cf2f 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -23,70 +23,32 @@ namespace kaldi { namespace nnet3 { -NnetChainComputeProb::NnetChainComputeProb( - const NnetComputeProbOptions &nnet_config, - const chain::ChainTrainingOptions &chain_config, - const fst::StdVectorFst &den_fst, - const Nnet &nnet): - nnet_config_(nnet_config), - chain_config_(chain_config), - den_graph_(den_fst, nnet.OutputDim("output")), - nnet_(nnet), - compiler_(nnet, nnet_config_.optimize_config, nnet_config_.compiler_config), - deriv_nnet_owned_(true), - deriv_nnet_(NULL), - num_minibatches_processed_(0) { - if (nnet_config_.compute_deriv) { - deriv_nnet_ = new Nnet(nnet_); - ScaleNnet(0.0, deriv_nnet_); - SetNnetAsGradient(deriv_nnet_); // force simple update - } else if (nnet_config_.store_component_stats) { - KALDI_ERR << "If you set store_component_stats == true and " - << "compute_deriv == false, use the other constructor."; - } - - if (chain_config.use_smbr_objective && - (chain_config.exclude_silence || chain_config.one_silence_class)) { - if (chain_config.silence_pdfs_str.empty()) { - KALDI_ERR << "--silence-pdfs is required if --exclude-silence or " - << "--one-silence-class is true."; - } - +void NnetChainComputeProb::ParseObjectiveOpts( + const chain::ChainTrainingOptions &chain_config) { + if (!chain_config.silence_pdfs_str.empty()) { std::vector silence_pdfs; SplitStringToVector(chain_config.silence_pdfs_str, ":,", false, &silence_pdfs); - int32 num_pdfs = nnet.OutputDim("output"); - std::vector indices(num_pdfs, -1); - - if (chain_config.exclude_silence) { - for (size_t i = 0; i < num_pdfs; i++) { - indices[i] = i; - } + int32 num_pdfs = nnet_.OutputDim("output"); + std::vector indices(num_pdfs); + for (size_t i = 0; i < num_pdfs; i++) { + indices[i] = i; + } - for (std::vector::iterator it = silence_pdfs.begin(); - it != silence_pdfs.end(); ++it) { - int32 pdf = std::atoi(it->c_str()); - if (pdf > num_pdfs) - KALDI_ERR << "Invalid pdf " << pdf << " in silence-pdfs " - << chain_config.silence_pdfs_str; - indices[pdf] = -1; - } - } else { - for (std::vector::iterator it = silence_pdfs.begin(); - it != silence_pdfs.end(); ++it) { - int32 pdf = std::atoi(it->c_str()); - if (pdf > num_pdfs) - KALDI_ERR << "Invalid pdf " << pdf << " in silence-pdfs " - << chain_config.silence_pdfs_str; - indices[pdf] = pdf; - } + for (std::vector::iterator it = silence_pdfs.begin(); + it != silence_pdfs.end(); ++it) { + int32 pdf = std::atoi(it->c_str()); + if (pdf > num_pdfs) + KALDI_ERR << "Invalid pdf " << pdf << " in silence-pdfs " + << chain_config.silence_pdfs_str; + indices[pdf] = -1; } sil_indices_.Resize(num_pdfs); sil_indices_.CopyFromVec(indices); } - + if 
(!chain_config.smbr_factors_str.empty()) ParseObjectiveScales(chain_config.smbr_factors_str, &smbr_factors_); @@ -101,6 +63,30 @@ NnetChainComputeProb::NnetChainComputeProb( &kl_factors_); } +NnetChainComputeProb::NnetChainComputeProb( + const NnetComputeProbOptions &nnet_config, + const chain::ChainTrainingOptions &chain_config, + const fst::StdVectorFst &den_fst, + const Nnet &nnet): + nnet_config_(nnet_config), + chain_config_(chain_config), + den_graph_(den_fst, nnet.OutputDim("output")), + nnet_(nnet), + compiler_(nnet, nnet_config_.optimize_config, nnet_config_.compiler_config), + deriv_nnet_owned_(true), + deriv_nnet_(NULL), + num_minibatches_processed_(0) { + if (nnet_config_.compute_deriv) { + deriv_nnet_ = new Nnet(nnet_); + ScaleNnet(0.0, deriv_nnet_); + SetNnetAsGradient(deriv_nnet_); // force simple update + } else if (nnet_config_.store_component_stats) { + KALDI_ERR << "If you set store_component_stats == true and " + << "compute_deriv == false, use the other constructor."; + } + + ParseObjectiveOpts(chain_config); +} NnetChainComputeProb::NnetChainComputeProb( const NnetComputeProbOptions &nnet_config, @@ -118,32 +104,9 @@ NnetChainComputeProb::NnetChainComputeProb( KALDI_ASSERT(den_graph_.NumPdfs() > 0); KALDI_ASSERT(nnet_config.store_component_stats && !nnet_config.compute_deriv); - if (!chain_config.silence_pdfs_str.empty()) { - std::vector silence_pdfs; - SplitStringToVector(chain_config.silence_pdfs_str, ":,", false, - &silence_pdfs); - - int32 num_pdfs = nnet->OutputDim("output"); - std::vector indices(num_pdfs); - for (size_t i = 0; i < num_pdfs; i++) { - indices[i] = i; - } - - for (std::vector::iterator it = silence_pdfs.begin(); - it != silence_pdfs.end(); ++it) { - int32 pdf = std::atoi(it->c_str()); - if (pdf > num_pdfs) - KALDI_ERR << "Invalid pdf " << pdf << " in silence-pdfs " - << chain_config.silence_pdfs_str; - indices[pdf] = -1; - } - - sil_indices_.Resize(num_pdfs); - sil_indices_.CopyFromVec(indices); - } + ParseObjectiveOpts(chain_config); } - const Nnet &NnetChainComputeProb::GetDeriv() const { if (!nnet_config_.compute_deriv) KALDI_ERR << "GetDeriv() called when no derivatives were requested."; @@ -239,12 +202,13 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); - BaseFloat tot_like, tot_mmi_objf, tot_l2_term, tot_weight; - if (chain_config_copy.kl_factor > 0.0) - KALDI_ASSERT(sup.supervision.numerator_post_targets.NumRows() > 0 - && chain_config_copy.smbr_factor == 0.0); + if (chain_config_copy.kl_factor > 0.0) { + KALDI_ASSERT(chain_config_copy.smbr_factor == 0.0); + if (!chain_config_copy.self_kl) + KALDI_ASSERT(sup.supervision.numerator_post_targets.NumRows() > 0); + } if (chain_config_copy.smbr_factor > 0.0) ComputeChainSmbrObjfAndDeriv( @@ -254,12 +218,25 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, (nnet_config_.compute_deriv ? &nnet_output_deriv : NULL), (use_xent ? &xent_deriv : NULL), sil_indices_.Dim() ? &sil_indices_ : NULL); - else + else { ComputeChainObjfAndDeriv(chain_config_copy, den_graph_, sup.supervision, nnet_output, &tot_like, &tot_l2_term, &tot_weight, (nnet_config_.compute_deriv ? &nnet_output_deriv : NULL), (use_xent ? 
&xent_deriv : NULL)); + + if (chain_config_copy.self_kl) { + const CuMatrixBase &teacher_nnet_output = + computer->GetOutput(sup.name + "-teacher"); + + BaseFloat num_objf = 0, num_weight = 0.0; + ComputeKLNumeratorObjfAndDeriv(chain_config_copy, den_graph_, teacher_nnet_output, + sup.supervision.weight, sup.supervision.num_sequences, + &num_objf, &num_weight, + &nnet_output_deriv, + (use_xent ? &xent_deriv : NULL)); + } + } // note: in this context we don't want to apply 'sup.deriv_weights' because // this code is used only in combination, where it's part of an L-BFGS diff --git a/src/nnet3/nnet-chain-diagnostics.h b/src/nnet3/nnet-chain-diagnostics.h index 916fc8ab981..9447eee4f5b 100644 --- a/src/nnet3/nnet-chain-diagnostics.h +++ b/src/nnet3/nnet-chain-diagnostics.h @@ -77,6 +77,7 @@ class NnetChainComputeProb { const fst::StdVectorFst &den_fst, Nnet *nnet); + void ParseObjectiveOpts(const chain::ChainTrainingOptions &chain_config); // Reset the likelihood stats, and the derivative stats (if computed). void Reset(); diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index a88c1fb5132..11a519ac30b 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -457,129 +457,6 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, ScaleNnet(0.0, delta_nnet_); } -/* -void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, - const NnetExample &eg, - NnetComputer *computer) { - // In backstitch training, the output-name with the "_backstitch" suffix is - // the one computed after the first, backward step of backstitch. - const std::string suffix = (is_backstitch_step2 ? "_backstitch" : ""); - std::vector::const_iterator iter = eg.io.begin(), - end = eg.io.end(); - for (; iter != end; ++iter) { - const NnetIo &io = *iter; - int32 node_index = nnet_->GetNodeIndex(io.name); - KALDI_ASSERT(node_index >= 0); - if (nnet_->IsOutputNode(node_index)) { - const CuMatrixBase &nnet_output = computer->GetOutput(io.name); - CuMatrix nnet_output_deriv(nnet_output.NumRows(), - nnet_output.NumCols(), - kUndefined); - bool use_xent = (opts_.chain_config.xent_regularize != 0.0); - std::string xent_name = io.name + "-xent"; // typically "output-xent". - CuMatrix xent_deriv; - - BaseFloat tot_objf, tot_l2_term, tot_weight; - - int32 num_sequences = NumSequencesInChainEg(io.indexes); - KALDI_ASSERT(io.features.NumRows() % num_sequences == 0); - int32 frames_per_sequence = io.features.NumRows() / num_sequences; - ComputeKLObjfAndDeriv(opts_.chain_config, den_graph_, - io.features, 1.0, nnet_output, - num_sequences, frames_per_sequence, - &tot_objf, &tot_l2_term, &tot_weight, - &nnet_output_deriv, - (use_xent ? &xent_deriv : NULL)); - - BaseFloat objf_scale = 1.0; - { - unordered_map::iterator it = - objective_scales_.find(io.name); - - if (it != objective_scales_.end()) { - objf_scale = it->second; - tot_objf *= it->second; - tot_l2_term *= it->second; - tot_weight *= it->second; - nnet_output_deriv.Scale(it->second); - } - } - - if (use_xent) { - // this block computes the cross-entropy objective. - const CuMatrixBase &xent_output = computer->GetOutput( - xent_name); - // at this point, xent_deriv is posteriors derived from the numerato - // computation. 
note, xent_objf has a factor of '.supervision.weight' - BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans); - - { - unordered_map::iterator it = - objective_scales_.find(xent_name); - - if (it != objective_scales_.end()) { - xent_objf *= it->second; - xent_deriv.Scale(it->second); - } - } - - objf_info_[xent_name + suffix].UpdateStats(xent_name + suffix, - opts_.nnet_config.print_interval, - num_minibatches_processed_, - tot_weight, xent_objf); - } - - if (opts_.apply_deriv_weights && io.deriv_weights.Dim() > 0) { - CuVector cu_deriv_weights(io.deriv_weights); - nnet_output_deriv.MulRowsVec(cu_deriv_weights); - if (use_xent) - xent_deriv.MulRowsVec(cu_deriv_weights); - } - - std::vector objective_values; - objective_values.push_back(tot_l2_term); - - { - unordered_map::iterator it - = objf_info_.find(io.name + suffix); - - if (it == objf_info_.end()) { - std::vector aux_objf_scales(1, objf_scale); // l2_term - - ObjectiveFunctionInfo totals(objf_scale, aux_objf_scales); - it = objf_info_.insert(it, std::make_pair(io.name + suffix, totals)); - } - - if (opts_.accumulate_avg_deriv && - it->second.deriv_sum.Dim() == 0) - it->second.deriv_sum.Resize(nnet_output.NumCols()); - - if (it->second.deriv_sum.Dim() > 0) - it->second.deriv_sum.AddRowSumMat(1.0, nnet_output_deriv, 1.0); - - it->second.UpdateStats(io.name + suffix, - opts_.nnet_config.print_interval, - num_minibatches_processed_, - tot_weight, tot_objf, objective_values); - } - - computer->AcceptInput(io.name, &nnet_output_deriv); - - if (use_xent) { - xent_deriv.Scale(opts_.chain_config.xent_regularize); - if (opts_.accumulate_avg_deriv && - objf_info_[xent_name + suffix].deriv_sum.Dim() == 0) - objf_info_[xent_name + suffix].deriv_sum.Resize(nnet_output.NumCols()); - if (objf_info_[xent_name + suffix].deriv_sum.Dim() > 0) - objf_info_[xent_name + suffix].deriv_sum.AddRowSumMat( - 1.0, xent_deriv, 1.0); - computer->AcceptInput(xent_name, &xent_deriv); - } - } - } -} -*/ - void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, const NnetChainExample &eg, NnetComputer *computer) { @@ -631,9 +508,11 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, BaseFloat tot_objf, tot_mmi_objf, tot_l2_term, tot_weight; - if (chain_config.kl_factor > 0.0) - KALDI_ASSERT(sup.supervision.numerator_post_targets.NumRows() > 0 - && chain_config.smbr_factor == 0.0); + if (chain_config.kl_factor > 0.0) { + KALDI_ASSERT(chain_config.smbr_factor == 0.0); + if (!chain_config.self_kl) + KALDI_ASSERT(sup.supervision.numerator_post_targets.NumRows() > 0); + } if (chain_config.smbr_factor > 0.0) { ComputeChainSmbrObjfAndDeriv(chain_config, den_graph_, @@ -649,6 +528,18 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, &tot_objf, &tot_l2_term, &tot_weight, &nnet_output_deriv, (use_xent ? &xent_deriv : NULL)); + + if (chain_config.self_kl) { + const CuMatrixBase &teacher_nnet_output = + computer->GetOutput(sup.name + "-teacher"); + + BaseFloat num_objf = 0, num_weight = 0.0; + ComputeKLNumeratorObjfAndDeriv(chain_config, den_graph_, teacher_nnet_output, + sup.supervision.weight, sup.supervision.num_sequences, + &num_objf, &num_weight, + &nnet_output_deriv, + (use_xent ? 
&xent_deriv : NULL)); + } } if (use_xent) { From d9b8949714f059c3723416b97efb1b79ce8a3411 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 2 Apr 2018 04:06:46 -0400 Subject: [PATCH 141/174] Fix bugs --- src/chain/chain-supervision-splitter.h | 2 +- src/chain/chain-supervision.cc | 59 +++++++------------ src/chain/chain-supervision.h | 5 +- src/chain/chain-training.cc | 10 ++-- src/chainbin/nnet3-chain-normalize-egs.cc | 10 ---- src/chainbin/nnet3-chain-split-and-get-egs.cc | 11 ++-- src/nnet3/nnet-chain-diagnostics.cc | 20 +------ src/nnet3/nnet-chain-example.cc | 9 +-- 8 files changed, 38 insertions(+), 88 deletions(-) diff --git a/src/chain/chain-supervision-splitter.h b/src/chain/chain-supervision-splitter.h index e0d2397466e..654a3fbbf24 100644 --- a/src/chain/chain-supervision-splitter.h +++ b/src/chain/chain-supervision-splitter.h @@ -128,7 +128,7 @@ class SupervisionLatticeSplitter { const Lattice& GetLattice() const { return lat_; } - const std::StdVectorFst& ToleranceFst() const { return tolerance_fst_; } + const fst::StdVectorFst& ToleranceFst() const { return tolerance_fst_; } private: // Creates an output lattice covering frames begin_frame <= t < end_frame, // assuming that the corresponding state-range that we need to diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 768eccc01f1..834fff364be 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -606,9 +606,6 @@ void Supervision::Write(std::ostream &os, bool binary) const { if (numerator_post_targets.NumRows() > 0) { WriteToken(os, binary, ""); numerator_post_targets.Write(os, binary); - - WriteToken(os, binary, ""); - WriteBasicType(os, binary, numerator_log_prob); } if (binary == false) { // In text mode, write the FST without any compactification. 
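The new --self-kl option relies on a naming convention rather than on stored posterior targets: for a chain output named e.g. "output", the computation is expected to also provide a node named "output-teacher", whose output is passed to ComputeKLNumeratorObjfAndDeriv and whose denominator-based derivative is accumulated into the student output's derivative. A small, hypothetical guard illustrating the assumed convention (member names follow the trainer code in nnet-chain-training.cc; this check is not part of the patch):

// Hypothetical guard, assuming the "-teacher" suffix convention, which
// mirrors the existing "-xent" suffix used for cross-entropy outputs.
std::string teacher_name = sup.name + "-teacher";
if (nnet_->GetNodeIndex(teacher_name) == -1)
  KALDI_ERR << "--self-kl is set, but the network has no output node named "
            << teacher_name;
const CuMatrixBase<BaseFloat> &teacher_nnet_output =
    computer->GetOutput(teacher_name);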
@@ -651,7 +648,6 @@ void Supervision::Swap(Supervision *other) { std::swap(e2e, other->e2e); std::swap(e2e_fsts, other->e2e_fsts); std::swap(numerator_post_targets, other->numerator_post_targets); - std::swap(numerator_log_prob, other->numerator_log_prob); } void Supervision::Read(std::istream &is, bool binary) { @@ -677,7 +673,6 @@ void Supervision::Read(std::istream &is, bool binary) { if (PeekToken(is, binary) == 'N') { ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &numerator_log_prob); } if (PeekToken(is, binary) == '/') { @@ -752,9 +747,9 @@ int32 ComputeFstStateTimes(const fst::StdVectorFst &fst, return total_length; } -Supervision::Supervision(int32 dim, const Posterior &labels, BaseFloat log_prob): +Supervision::Supervision(int32 dim, const Posterior &labels): weight(1.0), num_sequences(1), frames_per_sequence(labels.size()), - label_dim(dim), e2e(false), numerator_log_prob(log_prob) { + label_dim(dim), e2e(false) { SparseMatrix sparse_feats(dim, labels); numerator_post_targets = sparse_feats; } @@ -764,8 +759,7 @@ Supervision::Supervision(const Supervision &other): frames_per_sequence(other.frames_per_sequence), label_dim(other.label_dim), fst(other.fst), e2e(other.e2e), e2e_fsts(other.e2e_fsts), - numerator_post_targets(other.numerator_post_targets), - numerator_log_prob(other.numerator_log_prob) { } + numerator_post_targets(other.numerator_post_targets) { } // This static function is called by AppendSupervision if the supervisions @@ -786,39 +780,33 @@ void AppendSupervisionE2e(const std::vector &input, } void AppendSupervisionPost(const std::vector &input, - std::vector *output_supervision) { + Supervision *output_supervision) { KALDI_ASSERT(!input.empty()); int32 label_dim = input[0]->label_dim, num_inputs = input.size(); KALDI_ASSERT(num_inputs > 1); KALDI_ASSERT(input[0]->numerator_post_targets.NumRows() > 0); - KALDI_ASSERT(output_supervision->size() == 1); // otherwise not supported - KALDI_ASSERT((*output_supervision)[0].num_sequences == num_inputs); + KALDI_ASSERT(output_supervision->num_sequences == num_inputs); std::vector output_targets(num_inputs); output_targets[0] = &(input[0]->numerator_post_targets); - KALDI_ASSERT(kaldi::ApproxEqual( - (*output_supervision)[0].numerator_log_prob, - input[0]->numerator_log_prob)); - for (int32 i = 1; i < num_inputs; i++) { output_targets[i] = &(input[i]->numerator_post_targets); KALDI_ASSERT(output_targets[i]->NumRows() > 0); KALDI_ASSERT(output_targets[i]->NumCols() == label_dim); KALDI_ASSERT(input[i]->frames_per_sequence == - (*output_supervision)[0].frames_per_sequence); - (*output_supervision)[0].numerator_log_prob += input[i]->numerator_log_prob; + output_supervision->frames_per_sequence); } AppendGeneralMatrixRows( - output_targets, &((*output_supervision)[0].numerator_post_targets), + output_targets, &(output_supervision->numerator_post_targets), true); // sort by t - KALDI_ASSERT((*output_supervision)[0].numerator_post_targets.NumRows() - == (*output_supervision)[0].frames_per_sequence - * (*output_supervision)[0].num_sequences); - KALDI_ASSERT((*output_supervision)[0].frames_per_sequence * (*output_supervision)[0].num_sequences == (*output_supervision)[0].numerator_post_targets.NumRows()); + KALDI_ASSERT(output_supervision->numerator_post_targets.NumRows() + == output_supervision->frames_per_sequence + * output_supervision->num_sequences); + KALDI_ASSERT(output_supervision->frames_per_sequence * output_supervision->num_sequences == output_supervision->numerator_post_targets.NumRows()); } void 
AppendSupervision(const std::vector &input, @@ -856,26 +844,19 @@ void AppendSupervision(const std::vector &input, } } - KALDI_ASSERT(output_supervision->size() == 1); // otherwise not supported - KALDI_ASSERT(output_was_merged.size() == output_supervision->size()); - for (size_t i = 0; i < output_supervision->size(); i++) { - if (output_was_merged[i]) { - fst::StdVectorFst &out_fst = (*output_supervision)[i].fst; - // The process of concatenation will have introduced epsilons. - fst::RmEpsilon(&out_fst); - if (input[0]->numerator_post_targets.NumRows() > 0 && out_fst.Start() >= 0) - SortBreadthFirstSearch(&out_fst); - } - - } + fst::StdVectorFst &out_fst = output_supervision->fst; + // The process of concatenation will have introduced epsilons. + fst::RmEpsilon(&out_fst); + if (input[0]->numerator_post_targets.NumRows() > 0 && out_fst.Start() >= 0) + SortBreadthFirstSearch(&out_fst); if (input[0]->numerator_post_targets.NumRows() > 0) { AppendSupervisionPost(input, output_supervision); - KALDI_VLOG(2) << (*output_supervision)[0].frames_per_sequence << " * " - << (*output_supervision)[0].num_sequences << " == " - << (*output_supervision)[0].numerator_post_targets.NumRows(); + KALDI_VLOG(2) << output_supervision->frames_per_sequence << " * " + << output_supervision->num_sequences << " == " + << output_supervision->numerator_post_targets.NumRows(); - KALDI_ASSERT((*output_supervision)[0].frames_per_sequence * (*output_supervision)[0].num_sequences == (*output_supervision)[0].numerator_post_targets.NumRows()); + KALDI_ASSERT(output_supervision->frames_per_sequence * output_supervision->num_sequences == output_supervision->numerator_post_targets.NumRows()); } } diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index b3288d7baed..13878a4595c 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -265,12 +265,11 @@ struct Supervision { std::vector e2e_fsts; GeneralMatrix numerator_post_targets; - BaseFloat numerator_log_prob; Supervision(): weight(1.0), num_sequences(1), frames_per_sequence(-1), - label_dim(-1), e2e(false), numerator_log_prob(0.0) { } + label_dim(-1), e2e(false) { } - Supervision(int32 dim, const Posterior &labels, BaseFloat log_prob = 0.0); + Supervision(int32 dim, const Posterior &labels); Supervision(const Supervision &other); diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 59a57e588bb..f54bb671312 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -287,18 +287,18 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, } if (opts.kl_factor > 0.0) { + CuMatrix numerator_post(nnet_output.NumRows(), nnet_output.NumCols()); + supervision.numerator_post_targets.CopyToMat(&numerator_post); if (xent_output_deriv) { - CuMatrix numerator_post(nnet_output.NumRows(), nnet_output.NumCols()); - supervision.numerator_post_targets.CopyToMat(&numerator_post); xent_output_deriv->AddMat(supervision.weight * opts.kl_factor, numerator_post); if (nnet_output_deriv) nnet_output_deriv->AddMat(supervision.weight * opts.kl_factor, numerator_post); } else if (nnet_output_deriv) { - CuMatrix numerator_post(nnet_output.NumRows(), nnet_output.NumCols()); - supervision.numerator_post_targets.CopyToMat(&numerator_post); nnet_output_deriv->AddMat(supervision.weight * opts.kl_factor, numerator_post); } - num_logprob_weighted += opts.kl_factor * supervision.numerator_log_prob * supervision.weight; + + num_logprob_weighted += supervision.weight * opts.kl_factor * + TraceMatMat(nnet_output, 
numerator_post, kTrans); } *objf = num_logprob_weighted - den_logprob_weighted; diff --git a/src/chainbin/nnet3-chain-normalize-egs.cc b/src/chainbin/nnet3-chain-normalize-egs.cc index c387a745296..a97797e3246 100644 --- a/src/chainbin/nnet3-chain-normalize-egs.cc +++ b/src/chainbin/nnet3-chain-normalize-egs.cc @@ -63,21 +63,11 @@ int main(int argc, char *argv[]) { fst::StdVectorFst normalization_fst; ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); -<<<<<<< HEAD - if (scale < 0.0) { - KALDI_ERR << "Invalid scale on normalization FST; must be >= 0.0"; - } - - if (scale != 1.0) { - ApplyProbabilityScale(scale, &normalization_fst); - } -======= if (normalization_fst_scale < 0.0) KALDI_ERR << "Invalid scale on normalization FST; must be >= 0.0"; if (normalization_fst_scale != 1.0) ApplyProbabilityScale(normalization_fst_scale, &normalization_fst); ->>>>>>> e8b4f50d30df411bb156ff3927a41f20f6cffa99 SequentialNnetChainExampleReader example_reader(examples_rspecifier); NnetChainExampleWriter example_writer(examples_wspecifier); diff --git a/src/chainbin/nnet3-chain-split-and-get-egs.cc b/src/chainbin/nnet3-chain-split-and-get-egs.cc index af34639a0eb..4456c1d3b13 100644 --- a/src/chainbin/nnet3-chain-split-and-get-egs.cc +++ b/src/chainbin/nnet3-chain-split-and-get-egs.cc @@ -75,7 +75,7 @@ void ConvertLatticeToPdfLabels( } } -BaseFloat LatticeToNumeratorPost(const Lattice &lat, +void LatticeToNumeratorPost(const Lattice &lat, const TransitionModel &trans_model, const fst::StdVectorFst &normalization_fst, BaseFloat lm_scale, std::string key, @@ -104,7 +104,7 @@ BaseFloat LatticeToNumeratorPost(const Lattice &lat, KALDI_ERR << "Cycles detected in lattice."; } - return LatticeForwardBackward(lat_copy, post); + LatticeForwardBackward(lat_copy, post); } /** @@ -181,20 +181,17 @@ static bool ProcessFile(const chain::SupervisionOptions &sup_opts, if (include_numerator_post) { Posterior pdf_post; - supervision_part.numerator_log_prob = LatticeToNumeratorPost( + LatticeToNumeratorPost( *lat_part, trans_model, normalization_fst, sup_opts.lm_scale, utt_id, &pdf_post); KALDI_ASSERT(pdf_post.size() == num_frames_subsampled); Posterior check_post; - BaseFloat check_prob; if (GetVerboseLevel() >= 2) { - check_prob = LatticeToNumeratorPost( + LatticeToNumeratorPost( sup_lat_splitter.GetLattice(), trans_model, normalization_fst, sup_opts.lm_scale, utt_id, &check_post); } - KALDI_VLOG(2) << "log-prob=" << supervision_part.numerator_log_prob - << "; check-prob=" << check_prob; Posterior labels; labels.resize(num_frames_subsampled); diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index b7799f94ea8..1581ac51893 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -335,7 +335,7 @@ bool NnetChainComputeProb::PrintTotalStats() const { } -std::pair NnetChainComputeProb::GetTotalObjective() const { +double NnetChainComputeProb::GetTotalObjective(double *total_weight) const { unordered_map::const_iterator iter, end; iter = objf_info_.begin(); @@ -352,10 +352,10 @@ std::pair NnetChainComputeProb::GetTotalObjective() const tot_objf += like + aux_objfs.Sum(); tot_weight += info.tot_weight; } - return std::make_pair(tot_objf, tot_weight); + if (total_weight) *total_weight = tot_weight; + return tot_objf; } - const ChainObjectiveInfo* NnetChainComputeProb::GetObjective( const std::string &output_name) const { unordered_map::const_iterator @@ -366,20 +366,6 @@ const ChainObjectiveInfo* NnetChainComputeProb::GetObjective( return NULL; } 
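With the duplicate definition removed just below, GetTotalObjective has a single form that returns the summed objective across all outputs and optionally reports the total frame weight through a pointer. A short usage sketch, where prob_computer is a placeholder for an NnetChainComputeProb instance:

// 'prob_computer' is a placeholder name for an NnetChainComputeProb instance.
double tot_weight = 0.0;
double tot_objf = prob_computer.GetTotalObjective(&tot_weight);
if (tot_weight > 0.0)
  KALDI_LOG << "Overall objective is " << (tot_objf / tot_weight)
            << " per frame, over " << tot_weight << " frames.";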
-double NnetChainComputeProb::GetTotalObjective(double *total_weight) const { - double tot_objectives = 0.0; - double tot_weight = 0.0; - unordered_map::const_iterator - iter = objf_info_.begin(), end = objf_info_.end(); - for (; iter != end; ++iter) { - tot_objectives += iter->second.tot_like + iter->second.tot_l2_term; - tot_weight += iter->second.tot_weight; - } - - if (total_weight) *total_weight = tot_weight; - return tot_objectives; -} - static bool HasXentOutputs(const Nnet &nnet) { const std::vector node_names = nnet.GetNodeNames(); for (std::vector::const_iterator it = node_names.begin(); diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index 31647b1f3ce..c808e3a3fc3 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -212,13 +212,10 @@ static void MergeSupervision( AppendSupervision(input_supervision, &output_supervision); - if (output_supervision[0].numerator_post_targets.NumRows() > 0) - KALDI_ASSERT(output_supervision[0].frames_per_sequence * output_supervision[0].num_sequences == output_supervision[0].numerator_post_targets.NumRows()); + if (output_supervision.numerator_post_targets.NumRows() > 0) + KALDI_ASSERT(output_supervision.frames_per_sequence * output_supervision.num_sequences == output_supervision.numerator_post_targets.NumRows()); - if (output_supervision.size() != 1) - KALDI_ERR << "Failed to merge 'chain' examples-- inconsistent lengths " - << "or weights?"; - output->supervision.Swap(&(output_supervision[0])); + output->supervision.Swap(&output_supervision); if (output->supervision.numerator_post_targets.NumRows() > 0) KALDI_ASSERT(output->supervision.frames_per_sequence * output->supervision.num_sequences == output->supervision.numerator_post_targets.NumRows()); From a7144a8231997a09fe813ba89089d53fca423a09 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 2 Apr 2018 15:36:54 -0400 Subject: [PATCH 142/174] Bug fix --- src/chain/chain-supervision-test.cc | 2 +- src/chain/chain-supervision.cc | 38 +++++++++-------------------- 2 files changed, 12 insertions(+), 28 deletions(-) diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index bded9836415..08858bddaab 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -104,7 +104,7 @@ void TestSupervisionNumerator(const Supervision &supervision) { CuMatrix nnet_output_deriv(nnet_output.NumRows(), nnet_output.NumCols()); - num.Backward(&nnet_output_deriv); + num.Backward(1.0, &nnet_output_deriv); int32 dim = 3; Vector predicted_objf_changes(dim), diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 834fff364be..50f797bfbd4 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -603,10 +603,6 @@ void Supervision::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteBasicType(os, binary, e2e); if (!e2e) { - if (numerator_post_targets.NumRows() > 0) { - WriteToken(os, binary, ""); - numerator_post_targets.Write(os, binary); - } if (binary == false) { // In text mode, write the FST without any compactification. 
WriteFstKaldi(os, binary, fst); @@ -636,6 +632,10 @@ void Supervision::Write(std::ostream &os, bool binary) const { } WriteToken(os, binary, ""); } + if (numerator_post_targets.NumRows() > 0) { + WriteToken(os, binary, ""); + numerator_post_targets.Write(os, binary); + } WriteToken(os, binary, ""); } @@ -667,19 +667,6 @@ void Supervision::Read(std::istream &is, bool binary) { e2e = false; } if (!e2e) { - if (PeekToken(is, binary) == 'N') { - ExpectToken(is, binary, ""); - numerator_post_targets.Read(is, binary); - - if (PeekToken(is, binary) == 'N') { - ExpectToken(is, binary, ""); - } - - if (PeekToken(is, binary) == '/') { - ExpectToken(is, binary, ""); - return; - } - } if (!binary) { ReadFstKaldi(is, binary, &fst); } else { @@ -709,7 +696,12 @@ void Supervision::Read(std::istream &is, bool binary) { } ExpectToken(is, binary, ""); } - ExpectToken(is, binary, ""); + if (PeekToken(is, binary) == 'N') { + ExpectToken(is, binary, ""); + numerator_post_targets.Read(is, binary); + } else { + ExpectToken(is, binary, ""); + } } int32 ComputeFstStateTimes(const fst::StdVectorFst &fst, @@ -747,13 +739,6 @@ int32 ComputeFstStateTimes(const fst::StdVectorFst &fst, return total_length; } -Supervision::Supervision(int32 dim, const Posterior &labels): - weight(1.0), num_sequences(1), frames_per_sequence(labels.size()), - label_dim(dim), e2e(false) { - SparseMatrix sparse_feats(dim, labels); - numerator_post_targets = sparse_feats; -} - Supervision::Supervision(const Supervision &other): weight(other.weight), num_sequences(other.num_sequences), frames_per_sequence(other.frames_per_sequence), @@ -847,8 +832,7 @@ void AppendSupervision(const std::vector &input, fst::StdVectorFst &out_fst = output_supervision->fst; // The process of concatenation will have introduced epsilons. fst::RmEpsilon(&out_fst); - if (input[0]->numerator_post_targets.NumRows() > 0 && out_fst.Start() >= 0) - SortBreadthFirstSearch(&out_fst); + SortBreadthFirstSearch(&out_fst); if (input[0]->numerator_post_targets.NumRows() > 0) { AppendSupervisionPost(input, output_supervision); From 88c03ec6c79e0ae0897c78a99c0432a4be3fdb4a Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 2 Apr 2018 15:47:27 -0400 Subject: [PATCH 143/174] Minor bug fixes --- egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh | 28 ++++++++----- src/chain/chain-supervision-splitter-test.cc | 9 ++++- src/chainbin/nnet3-chain-normalize-egs.cc | 10 ----- src/chainbin/nnet3-chain-split-and-get-egs.cc | 2 +- src/nnet3/nnet-chain-diagnostics.cc | 40 ++++++------------- 5 files changed, 39 insertions(+), 50 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh index e0fd6b5c01a..e185c186218 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh @@ -46,7 +46,7 @@ frames_per_iter=400000 # each iteration of training, see this many frames per # used. This is just a guideline; it will pick a number # that divides the number of samples in the entire data. -right_tolerance= #CTC right tolerance == max label delay. +right_tolerance= # chain right tolerance == max label delay. 
left_tolerance= right_tolerance_silence= # Tolerances for silence phones @@ -55,7 +55,7 @@ left_tolerance_silence= transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms stage=0 -max_jobs_run=15 # This should be set to the maximum number of jobs you are +max_jobs_run=15 # This should be set to the maximum number of nnet3-chain-get-egs jobs you are # comfortable to run in parallel; you can increase it if your disk # speed is greater and you have more machines. max_shuffle_jobs_run=50 # the shuffle jobs now include the nnet3-chain-normalize-egs command, @@ -68,9 +68,10 @@ cmvn_opts= # can be used for specifying CMVN options, if feature type is not ld # LDA transform). This is used to turn off CMVN in the online-nnet experiments. lattice_lm_scale= # If supplied, the graph/lm weight of the lattices will be # used (with this scale) in generating supervisions -egs_weight=1.0 # The weight which determines how much each training example - # contributes to gradients while training (can be used - # to down/up-weight a dataset) + # This is 0 by default for conventional supervised training, + # but may be close to 1 for the unsupervised part of the data + # in semi-supervised training. The optimum is usually + # 0.5 for unsupervised data. lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, # before being used to get supervisions. acwt=0.1 # For pruning @@ -110,6 +111,14 @@ if [ $# != 4 ]; then echo " --num-egs-diagnostic <#frames;4000> # Number of egs used in computing (train,valid) diagnostics" echo " --num-valid-egs-combine <#frames;10000> # Number of egs used in getting combination weights at the" echo " # very end." + echo " --lattice-lm-scale # If supplied, the graph/lm weight of the lattices will be " + echo " # used (with this scale) in generating supervisions" + echo " --lattice-prune-beam # If supplied, the lattices will be pruned to this beam, " + echo " # before being used to get supervisions." + echo " --acwt # Acoustic scale -- affects pruning" + echo " --deriv-weights-scp # If supplied, adds per-frame weights to the supervision." + echo " --generate-egs-scp # Generates scp files -- Required if the egs will be " + echo " # used for multilingual/multitask training." echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." @@ -286,7 +295,7 @@ chain_supervision_all_opts="--supervision.frame-subsampling-factor=$alignment_su [ ! -z $left_tolerance ] && \ chain_supervision_all_opts="$chain_supervision_all_opts --supervision.left-tolerance=$left_tolerance" -normalization_scale=1.0 +normalization_fst_scale=1.0 lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" if [ ! -z $lattice_prune_beam ]; then @@ -300,7 +309,7 @@ fi if [ ! 
-z "$lattice_lm_scale" ]; then chain_supervision_all_opts="$chain_supervision_all_opts --supervision.lm-scale=$lattice_lm_scale" - normalization_scale=$(perl -e " + normalization_fst_scale=$(perl -e " if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) { print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; exit(1); @@ -418,7 +427,6 @@ if [ $stage -le 4 ]; then lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl \ "$lats_rspecifier" ark:- \| \ nnet3-chain-split-and-get-egs $chain_supervision_all_opts \ - --supervision.weight=$egs_weight \ $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ --num-frames-overlap=$frames_overlap_per_eg \ "$feats" $chaindir/tree $chaindir/0.trans_mdl \ @@ -444,7 +452,7 @@ if [ $stage -le 5 ]; then output_archive="ark:$dir/cegs.JOB.ark" fi $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-chain-normalize-egs --normalization-scale=$normalization_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- $output_archive || exit 1; if $generate_egs_scp; then @@ -474,7 +482,7 @@ if [ $stage -le 5 ]; then done done $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-chain-normalize-egs --normalization-scale=$normalization_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:- \| \ nnet3-chain-copy-egs ark:- $output_archives || exit 1; diff --git a/src/chain/chain-supervision-splitter-test.cc b/src/chain/chain-supervision-splitter-test.cc index ccc6cdabbfd..6664478a482 100644 --- a/src/chain/chain-supervision-splitter-test.cc +++ b/src/chain/chain-supervision-splitter-test.cc @@ -274,7 +274,7 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); - int32 num_phones = 1; + int32 num_phones = 2; po.Register("num-phones", &num_phones, "Number of phones"); @@ -289,7 +289,14 @@ int main(int argc, char *argv[]) { po.Read(argc, argv); + sup_opts.left_tolerance = 1; + sup_opts.right_tolerance = 1; kaldi::chain::TestToleranceFst(sup_opts, num_phones); + + sup_opts.left_tolerance = 0; + sup_opts.right_tolerance = 0; + kaldi::chain::TestToleranceFst(sup_opts, num_phones); + return 0; for (int32 i = 0; i < 10; i++) { diff --git a/src/chainbin/nnet3-chain-normalize-egs.cc b/src/chainbin/nnet3-chain-normalize-egs.cc index e7b5eed045c..a97797e3246 100644 --- a/src/chainbin/nnet3-chain-normalize-egs.cc +++ b/src/chainbin/nnet3-chain-normalize-egs.cc @@ -63,21 +63,11 @@ int main(int argc, char *argv[]) { fst::StdVectorFst normalization_fst; ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); -<<<<<<< HEAD - if (scale < 0.0) { - KALDI_ERR << "Invalid scale on normalization FST; must be >= 0.0"; - } - - if (scale != 1.0) { - ScaleFst(scale, &normalization_fst); - } -======= if (normalization_fst_scale < 0.0) KALDI_ERR << "Invalid scale on normalization FST; must be >= 0.0"; if (normalization_fst_scale != 1.0) ApplyProbabilityScale(normalization_fst_scale, &normalization_fst); ->>>>>>> e8b4f50d30df411bb156ff3927a41f20f6cffa99 SequentialNnetChainExampleReader 
example_reader(examples_rspecifier); NnetChainExampleWriter example_writer(examples_wspecifier); diff --git a/src/chainbin/nnet3-chain-split-and-get-egs.cc b/src/chainbin/nnet3-chain-split-and-get-egs.cc index 384433fac69..e07bcdc12c9 100644 --- a/src/chainbin/nnet3-chain-split-and-get-egs.cc +++ b/src/chainbin/nnet3-chain-split-and-get-egs.cc @@ -295,7 +295,7 @@ int main(int argc, char *argv[]) { } if (sup_opts.lm_scale != 0.0) { - ScaleFst(1.0 - sup_opts.lm_scale, &normalization_fst); + fst::ApplyProbabilityScale(1.0 - sup_opts.lm_scale, &normalization_fst); } } diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 5f703798fe7..9e659533bf9 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -346,27 +346,6 @@ bool NnetChainComputeProb::PrintTotalStats() const { } -std::pair NnetChainComputeProb::GetTotalObjective() const { - unordered_map::const_iterator - iter, end; - iter = objf_info_.begin(); - end = objf_info_.end(); - BaseFloat tot_objf = 0.0, tot_weight = 0.0; - for (; iter != end; ++iter) { - const std::string &name = iter->first; - int32 node_index = nnet_.GetNodeIndex(name); - KALDI_ASSERT(node_index >= 0); - const ChainObjectiveInfo &info = iter->second; - BaseFloat like = (info.tot_like / info.tot_weight); - ObjectiveValues aux_objfs(info.tot_aux_objfs); - aux_objfs.Scale(info.tot_weight); - tot_objf += like + aux_objfs.Sum(); - tot_weight += info.tot_weight; - } - return std::make_pair(tot_objf, tot_weight); -} - - const ChainObjectiveInfo* NnetChainComputeProb::GetObjective( const std::string &output_name) const { unordered_map::const_iterator @@ -378,17 +357,22 @@ const ChainObjectiveInfo* NnetChainComputeProb::GetObjective( } double NnetChainComputeProb::GetTotalObjective(double *total_weight) const { - double tot_objectives = 0.0; - double tot_weight = 0.0; unordered_map::const_iterator - iter = objf_info_.begin(), end = objf_info_.end(); + iter, end; + iter = objf_info_.begin(); + end = objf_info_.end(); + BaseFloat tot_objf = 0.0, tot_weight = 0.0; for (; iter != end; ++iter) { - tot_objectives += iter->second.tot_like + iter->second.tot_l2_term; - tot_weight += iter->second.tot_weight; + const ChainObjectiveInfo &info = iter->second; + BaseFloat like = (info.tot_like / info.tot_weight); + ObjectiveValues aux_objfs(info.tot_aux_objfs); + aux_objfs.Scale(info.tot_weight); + tot_objf += like + aux_objfs.Sum(); + tot_weight += info.tot_weight; } - if (total_weight) *total_weight = tot_weight; - return tot_objectives; + if(total_weight) *total_weight = tot_weight; + return tot_objf; } static bool HasXentOutputs(const Nnet &nnet) { From dfb891feb51552e5c4b99e3cce580477a91dd9da Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 2 Apr 2018 17:08:34 -0400 Subject: [PATCH 144/174] Minor bug fixes --- src/chain/chain-supervision.cc | 4 ++-- src/chainbin/Makefile | 3 +-- src/nnet3/nnet-chain-example.cc | 3 ++- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 50f797bfbd4..243edaacf00 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -699,9 +699,9 @@ void Supervision::Read(std::istream &is, bool binary) { if (PeekToken(is, binary) == 'N') { ExpectToken(is, binary, ""); numerator_post_targets.Read(is, binary); - } else { - ExpectToken(is, binary, ""); } + + ExpectToken(is, binary, ""); } int32 ComputeFstStateTimes(const fst::StdVectorFst &fst, diff --git a/src/chainbin/Makefile 
b/src/chainbin/Makefile index f3d125e2aa0..dcd57ffec2a 100644 --- a/src/chainbin/Makefile +++ b/src/chainbin/Makefile @@ -13,8 +13,7 @@ BINFILES = chain-est-phone-lm chain-get-supervision chain-make-den-fst \ nnet3-chain-combine nnet3-chain-normalize-egs \ nnet3-chain-e2e-get-egs \ nnet3-chain-split-and-get-egs chain-split-lattices \ - nnet3-chain-split-convert-and-get-egs \ - nnet3-chain-get-egs-post + nnet3-chain-split-convert-and-get-egs OBJFILES = diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index c808e3a3fc3..561fda7285b 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -54,8 +54,9 @@ void NnetChainSupervision::Read(std::istream &is, bool binary) { KALDI_ASSERT(token == "" || token == ""); if (token == "") ReadVectorAsChar(is, binary, &deriv_weights); - else + else { deriv_weights.Read(is, binary); + } ExpectToken(is, binary, ""); } CheckDim(); From 975c130d9548998d0720ed8de5fa39f7fff0e621 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 3 Apr 2018 18:14:14 -0400 Subject: [PATCH 145/174] Fix issues related to egs --- .../nnet3/train/chain_objf/acoustic_model.py | 1 - .../nnet3/chain/multilingual/combine_egs.sh | 66 ++++++++++++++++--- .../steps/nnet3/multilingual/combine_egs.sh | 55 ++++++++++++++-- src/latbin/lattice-compose.cc | 1 + 4 files changed, 110 insertions(+), 13 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 468159b11b2..f5bf3b2d63f 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -203,7 +203,6 @@ def train_new_models(dir, iter, srand, num_jobs, --srand={srand} \ "{raw_model}" {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} \ - --truncate-deriv-weights={trunc_deriv} \ --frame-shift={fr_shft} \ {scp_or_ark}:{egs_dir}/cegs.{archive_index}.{scp_or_ark} ark:- | \ nnet3-chain-shuffle-egs --buffer-size={buf_size} \ diff --git a/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh index 76793e8fa25..b1738a2dc83 100755 --- a/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh @@ -28,6 +28,11 @@ block_size=256 # This is the number of consecutive egs that we take fro # access. lang2weight= # array of weights one per input languge to scale example's output # w.r.t its input language during training. +lang2num_copies= # comma-separated list of number of copies per + # input language + # This is another way to scale the effect of + # a langauge especially when the language has + # relatively very little data. stage=0 echo "$0 $@" # Print the command line for logging @@ -67,6 +72,15 @@ if [ ${#args[@]} != $[$num_langs+1] ]; then exit 1; fi +num_copies_per_lang= +if [ ! -z "$lang2num_copies" ]; then + IFS=, read -r -a num_copies_per_lang <<< $lang2num_copies + if [ ${#num_copies_per_lang[@]} -ne $num_langs ]; then + echo "$0: --lang2num-copies must be an array of num-langs=$num_langs integers" + exit 1 + fi +fi + required="cegs.scp combine.scp train_diagnostic.scp valid_diagnostic.scp" train_scp_list= train_diagnostic_scp_list= @@ -75,7 +89,7 @@ combine_scp_list= # read paramter from $egs_dir[0]/info and cmvn_opts # to write in multilingual egs_dir. 
-check_params="info/feat_dim info/ivector_dim info/left_context info/right_context cmvn_opts" +check_params="info/feat_dim info/ivector_dim info/left_context info/right_context cmvn_opts info/left_context_initial info/right_context_final" ivec_dim=`cat ${args[0]}/info/ivector_dim` if [ $ivec_dim -ne 0 ];then check_params="$check_params info/final.ie.id"; fi @@ -91,12 +105,48 @@ for lang in $(seq 0 $[$num_langs-1]);do echo "$0: no such file ${multi_egs_dir[$lang]}/$f." && exit 1; fi done - num_archives=$(cat ${multi_egs_dir[$lang]}/info/num_archives) + + if [ -z "$lang2num_copies" ] || [ ${num_copies_per_lang[$lang]} -eq 1 ]; then + train_scp_list="$train_scp_list ${multi_egs_dir[$lang]}/cegs.scp" + train_diagnostic_scp_list="$train_diagnostic_scp_list ${multi_egs_dir[$lang]}/train_diagnostic.scp" + valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${multi_egs_dir[$lang]}/valid_diagnostic.scp" + combine_scp_list="$combine_scp_list ${multi_egs_dir[$lang]}/combine.scp" + num_archives=$(cat ${multi_egs_dir[$lang]}/info/num_archives) + else + rm -f $megs_dir/lang${lang}_cegs.scp $megs_dir/lang${lang}_train_diagnostic.scp \ + $megs_dir/lang${lang}_valid_diagnostic.scp $megs_dir/lang${lang}_combine.scp + + if [ $(perl -e "{print int(${num_copies_per_lang[$lang]})}") != ${num_copies_per_lang[$lang]} ]; then + echo "$0: Expected --lang2num-copies to have only integers; " + echo "$0: got ${num_copies_per_lang[$lang]} for language $lang" + exit 1 + fi + + for i in `seq ${num_copies_per_lang[$lang]}`; do + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/cegs.scp >> \ + $megs_dir/lang${lang}_cegs.scp + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/train_diagnostic.scp >> \ + $megs_dir/lang${lang}_train_diagnostic.scp + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/valid_diagnostic.scp >> \ + $megs_dir/lang${lang}_valid_diagnostic.scp + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/combine.scp >> \ + $megs_dir/lang${lang}_combine.scp + done + + if [ $(head -n1 $megs_dir/lang${lang}_cegs.scp | wc -w) -ne 2 ]; then + echo "$0: Incorrect format in $megs_dir/lang${lang}_cegs.scp; something went wrong!" 
+ exit 1 + fi + + train_scp_list="$train_scp_list $megs_dir/lang${lang}_cegs.scp" + train_diagnostic_scp_list="$train_diagnostic_scp_list $megs_dir/lang${lang}_train_diagnostic.scp" + valid_diagnostic_scp_list="$valid_diagnostic_scp_list $megs_dir/lang${lang}_valid_diagnostic.scp" + combine_scp_list="$combine_scp_list $megs_dir/lang${lang}_combine.scp" + + num_archives=$(cat ${multi_egs_dir[$lang]}/info/num_archives) + num_archives=$[num_archives * ${num_copies_per_lang[$lang]}] + fi tot_num_archives=$[tot_num_archives+num_archives] - train_scp_list="$train_scp_list ${args[$lang]}/cegs.scp" - train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_diagnostic.scp" - valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.scp" - combine_scp_list="$combine_scp_list ${args[$lang]}/combine.scp" # check parameter dimension to be the same in all egs dirs for f in $check_params; do @@ -163,6 +213,6 @@ for egs_type in combine train_diagnostic valid_diagnostic; do mv $megs_dir/${egs_type}.weight.1.ark $megs_dir/${egs_type}.weight.ark || exit 1; mv $megs_dir/${egs_type}.1.scp $megs_dir/${egs_type}.scp || exit 1; done -mv $megs_dir/info/cegs.num_archives $megs_dir/info/num_archives || exit 1; -mv $megs_dir/info/cegs.num_tasks $megs_dir/info/num_tasks || exit 1; +echo $tot_num_archives > $megs_dir/info/num_archives || exit 1; +echo $num_langs > $megs_dir/info/num_tasks || exit 1; echo "$0: Finished preparing multilingual training example." diff --git a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh index e1aeb0b70d6..33ca39bbb11 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh +++ b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh @@ -24,6 +24,11 @@ block_size=256 # This is the number of consecutive egs that we take fro # access. lang2weight= # array of weights one per input languge to scale example's output # w.r.t its input language during training. +lang2num_copies= # comma-separated list of number of copies per + # input language + # This is another way to scale the effect of + # a langauge especially when the language has + # relatively very little data. stage=0 echo "$0 $@" # Print the command line for logging @@ -63,6 +68,15 @@ if [ ${#args[@]} != $[$num_langs+1] ]; then exit 1; fi +num_copies_per_lang= +if [ ! -z "$lang2num_copies" ]; then + IFS=, read -r -a num_copies_per_lang <<< $lang2num_copies + if [ ${#num_copies_per_lang[@]} -ne $num_langs ]; then + echo "$0: --lang2num-copies must be an array of num-langs=$num_langs integers" + exit 1 + fi +fi + required="egs.scp combine.scp train_diagnostic.scp valid_diagnostic.scp" train_scp_list= train_diagnostic_scp_list= @@ -87,12 +101,45 @@ for lang in $(seq 0 $[$num_langs-1]);do echo "$0: no such file ${multi_egs_dir[$lang]}/$f." 
&& exit 1; fi done + + if [ -z "$lang2num_copies" ] || [ ${num_copies_per_lang[$lang]} -eq 1 ]; then + train_scp_list="$train_scp_list ${multi_egs_dir[$lang]}/egs.scp" + train_diagnostic_scp_list="$train_diagnostic_scp_list ${multi_egs_dir[$lang]}/train_diagnostic.scp" + valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${multi_egs_dir[$lang]}/valid_diagnostic.scp" + combine_scp_list="$combine_scp_list ${multi_egs_dir[$lang]}/combine.scp" + else + rm -f $megs_dir/lang${lang}_egs.scp $megs_dir/lang${lang}_train_diagnostic.scp \ + $megs_dir/lang${lang}_valid_diagnostic.scp $megs_dir/lang${lang}_combine.scp + + if [ $(perl -e "{print int(${num_copies_per_lang[$lang]})}") != ${num_copies_per_lang[$lang]} ]; then + echo "$0: Expected --lang2num-copies to have only integers; " + echo "$0: got ${num_copies_per_lang[$lang]} for language $lang" + exit 1 + fi + + for i in `seq ${num_copies_per_lang[$lang]}`; do + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/egs.scp >> \ + $megs_dir/lang${lang}_egs.scp + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/train_diagnostic.scp >> \ + $megs_dir/lang${lang}_train_diagnostic.scp + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/valid_diagnostic.scp >> \ + $megs_dir/lang${lang}_valid_diagnostic.scp + awk -v i=$i '{print $1"-"i" "$2}' ${multi_egs_dir[$lang]}/combine.scp >> \ + $megs_dir/lang${lang}_combine.scp + done + + if [ $(head -n1 $megs_dir/lang${lang}_egs.scp | wc -w) -ne 2 ]; then + echo "$0: Incorrect format in $megs_dir/lang${lang}_egs.scp; something went wrong!" + exit 1 + fi + + train_scp_list="$train_scp_list $megs_dir/lang${lang}_egs.scp" + train_diagnostic_scp_list="$train_diagnostic_scp_list $megs_dir/lang${lang}_train_diagnostic.scp" + valid_diagnostic_scp_list="$valid_diagnostic_scp_list $megs_dir/lang${lang}_valid_diagnostic.scp" + combine_scp_list="$combine_scp_list $megs_dir/lang${lang}_combine.scp" + fi num_archives=$(cat ${multi_egs_dir[$lang]}/info/num_archives) tot_num_archives=$[tot_num_archives+num_archives] - train_scp_list="$train_scp_list ${args[$lang]}/egs.scp" - train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_diagnostic.scp" - valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.scp" - combine_scp_list="$combine_scp_list ${args[$lang]}/combine.scp" # check parameter dimension to be the same in all egs dirs for f in $check_params; do diff --git a/src/latbin/lattice-compose.cc b/src/latbin/lattice-compose.cc index df70229bfd8..ad53c534c96 100644 --- a/src/latbin/lattice-compose.cc +++ b/src/latbin/lattice-compose.cc @@ -166,6 +166,7 @@ int main(int argc, char *argv[]) { ConvertLattice(lat_out, &clat_out); compact_lattice_writer.Write(key, clat_out); } else { + fst::TopSort(&lat_out); lattice_writer.Write(key, lat_out); } n_done++; From cbd87b7d0ed958d0e6fca17a2fd460f8898a97a4 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 11 Apr 2018 17:57:01 -0400 Subject: [PATCH 146/174] Changes to KL training --- egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh | 17 +- .../nnet3/chain/make_weighted_den_fst.sh | 34 +-- src/chainbin/Makefile | 2 +- src/chainbin/chain-lattice-to-post.cc | 201 ++++++++++++++++++ src/chainbin/nnet3-chain-split-and-get-egs.cc | 67 +++--- src/nnet3/nnet-chain-diagnostics.cc | 9 +- src/nnet3/nnet-chain-training.cc | 9 +- 7 files changed, 279 insertions(+), 60 deletions(-) create mode 100644 src/chainbin/chain-lattice-to-post.cc diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh 
b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh index 47c01fc5086..85d07f7f8d9 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh @@ -74,6 +74,7 @@ lattice_lm_scale= # If supplied, the graph/lm weight of the lattices will be # 0.5 for unsupervised data. lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, # before being used to get supervisions. +kl_fst_scale= acwt=0.1 # For pruning phone_insertion_penalty= deriv_weights_scp= @@ -297,12 +298,12 @@ chain_supervision_all_opts="--supervision.frame-subsampling-factor=$alignment_su chain_supervision_all_opts="$chain_supervision_all_opts --supervision.left-tolerance=$left_tolerance" if $include_numerator_post; then - chain_supervision_all_opts="$chain_supervision_all_opts --include-numerator-post" + chain_supervision_all_opts="$chain_supervision_all_opts" fi normalization_fst_scale=1.0 -lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" +lats_rspecifier="ark,s,cs:gunzip -c $latdir/lat.JOB.gz |" if [ ! -z $lattice_prune_beam ]; then if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then lats_rspecifier="$lats_rspecifier lattice-1best --acoustic-scale=$acwt ark:- ark:- |" @@ -322,6 +323,15 @@ if [ ! -z "$lattice_lm_scale" ]; then print (1.0 - $lattice_lm_scale);") fi +if [ -z $kl_fst_scale ]; then + kl_fst_scale=$normalization_fst_scale +fi + +graph_posterior_rspecifier= +if $include_numerator_post; then + graph_posterior_rspecifier="$lats_rspecifier chain-lattice-to-post --acoustic-scale=$acwt --fst-scale=$kl_fst_scale $chaindir/den.fst $chaindir/0.trans_mdl ark:- ark:- |" +fi + [ ! -z $phone_insertion_penalty ] && \ chain_supervision_all_opts="$chain_supervision_all_opts --supervision.phone-ins-penalty=$phone_insertion_penalty" @@ -358,6 +368,7 @@ if [ $stage -le 2 ]; then utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=$srand \ + ${graph_posterior_rspecifier:+--graph-posterior-rspecifier="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp | chain-lattice-to-post --acoustic-scale=$acwt --fst-scale=$kl_fst_scale $chaindir/den.fst $chaindir/0.trans_mdl scp:- ark:- |"} \ $egs_opts $chaindir/normalization.fst \ "$valid_feats" $chaindir/tree $chaindir/0.trans_mdl \ ark,s,cs:- "ark:$dir/valid_all.cegs" || exit 1 & @@ -365,6 +376,7 @@ if [ $stage -le 2 ]; then utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=$srand \ + ${graph_posterior_rspecifier:+--graph-posterior-rspecifier="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp | chain-lattice-to-post --acoustic-scale=$acwt --fst-scale=$kl_fst_scale $chaindir/den.fst $chaindir/0.trans_mdl scp:- ark:- |"} \ $egs_opts $chaindir/normalization.fst \ "$train_subset_feats" $chaindir/tree $chaindir/0.trans_mdl \ ark,s,cs:- "ark:$dir/train_subset_all.cegs" || exit 1 & @@ -433,6 +445,7 @@ if [ $stage -le 4 ]; then nnet3-chain-split-and-get-egs $chain_supervision_all_opts \ $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ --num-frames-overlap=$frames_overlap_per_eg \ + 
${graph_posterior_rspecifier:+--graph-posterior-rspecifier="$graph_posterior_rspecifier"} \ "$feats" $chaindir/tree $chaindir/0.trans_mdl \ ark,s,cs:- ark:- \| \ nnet3-chain-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index 7dade75a0ed..45a48c10c91 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -86,37 +86,39 @@ else fi fi -if [ $stage -le 1 ]; then - all_phones="" # will contain the names of the .gz files containing phones, - # with some members possibly repeated per the --num-repeats - # option - for n in `seq 0 $[num_alignments-1]`; do - this_num_repeats=${num_repeats_array[$n]} - this_alignment_dir=${ali_dirs[$n]} - num_jobs=$(cat $this_alignment_dir/num_jobs) - if ! [ "$this_num_repeats" -gt 0 ]; then - echo "Expected comma-separated list of integers for --num-repeats option, got '$num_repeats'" - exit 1 - fi +all_phones="" # will contain the names of the .gz files containing phones, + # with some members possibly repeated per the --num-repeats + # option +for n in `seq 0 $[num_alignments-1]`; do + this_num_repeats=${num_repeats_array[$n]} + this_alignment_dir=${ali_dirs[$n]} + num_jobs=$(cat $this_alignment_dir/num_jobs) + if ! [ "$this_num_repeats" -gt 0 ]; then + echo "Expected comma-separated list of integers for --num-repeats option, got '$num_repeats'" + exit 1 + fi + if [ $stage -le 1 ]; then for j in $(seq $num_jobs); do gunzip -c $this_alignment_dir/ali.$j.gz; done | \ ali-to-phones $this_alignment_dir/final.mdl ark:- "ark:|gzip -c >$dir/phones.$n.gz" || exit 1; + fi - all_phones="$all_phones $(for r in $(seq $this_num_repeats); do echo $dir/phones.$n.gz; done)" - done + all_phones="$all_phones $(for r in $(seq $this_num_repeats); do echo $dir/phones.$n.gz; done)" +done +if [ $stage -le 2 ]; then $cmd $dir/log/make_phone_lm_fst.log \ gunzip -c $all_phones \| \ chain-est-phone-lm $lm_opts ark:- $dir/phone_lm.fst || exit 1; rm $dir/phones.*.gz fi -if [ $stage -le 2 ]; then +if [ $stage -le 3 ]; then copy-transition-model ${ali_dirs[0]}/final.mdl $dir/0.trans_mdl || exit 1; fi -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then $cmd $dir/log/make_den_fst.log \ chain-make-den-fst $dir/tree $dir/0.trans_mdl \ $dir/phone_lm.fst \ diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile index dcd57ffec2a..31855856fdb 100644 --- a/src/chainbin/Makefile +++ b/src/chainbin/Makefile @@ -13,7 +13,7 @@ BINFILES = chain-est-phone-lm chain-get-supervision chain-make-den-fst \ nnet3-chain-combine nnet3-chain-normalize-egs \ nnet3-chain-e2e-get-egs \ nnet3-chain-split-and-get-egs chain-split-lattices \ - nnet3-chain-split-convert-and-get-egs + nnet3-chain-split-convert-and-get-egs chain-lattice-to-post OBJFILES = diff --git a/src/chainbin/chain-lattice-to-post.cc b/src/chainbin/chain-lattice-to-post.cc new file mode 100644 index 00000000000..561014cf424 --- /dev/null +++ b/src/chainbin/chain-lattice-to-post.cc @@ -0,0 +1,201 @@ +// chainbin/chain-lattice-to-post.cc + +// Copyright 2017 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "hmm/posterior.h" +#include "lat/lattice-functions.h" +#include "chain/chain-supervision.h" + +namespace kaldi { +namespace chain { + +/** This function converts lattice to FSA with weight equal to + sum of acoustic and language score, and pdf_id + 1 as labels. + This assumes that the acoustic and language scores are scaled appropriately. +*/ +void ConvertLatticeToPdfLabels( + const TransitionModel &tmodel, + const Lattice &ifst, + fst::StdVectorFst *ofst) { + typedef fst::ArcTpl ArcIn; + typedef fst::StdArc ArcOut; + typedef ArcIn::StateId StateId; + ofst->DeleteStates(); + // The states will be numbered exactly the same as the original FST. + // Add the states to the new FST. + StateId num_states = ifst.NumStates(); + for (StateId s = 0; s < num_states; s++) + ofst->AddState(); + ofst->SetStart(ifst.Start()); + for (StateId s = 0; s < num_states; s++) { + LatticeWeight final_iweight = ifst.Final(s); + if (final_iweight != LatticeWeight::Zero()) { + fst::TropicalWeight final_oweight; + ConvertLatticeWeight(final_iweight, &final_oweight); + ofst->SetFinal(s, final_oweight); + } + for (fst::ArcIterator iter(ifst, s); + !iter.Done(); + iter.Next()) { + const ArcIn &arc = iter.Value(); + KALDI_PARANOID_ASSERT(arc.weight != LatticeWeight::Zero()); + ArcOut oarc; + ConvertLatticeWeight(arc.weight, &oarc.weight); + if (arc.ilabel == 0) + oarc.ilabel = 0; // epsilon arc + else + oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; // pdf + 1 + oarc.olabel = oarc.ilabel; + oarc.nextstate = arc.nextstate; + ofst->AddArc(s, oarc); + } + } +} + +void LatticeToNumeratorPost(const Lattice &lat, + const TransitionModel &trans_model, + const fst::StdVectorFst &fst, + BaseFloat lm_scale, std::string key, + Posterior *post) { + fst::StdVectorFst sup_fst; + ConvertLatticeToPdfLabels(trans_model, lat, &sup_fst); + + if (!AddWeightToFst(fst, &sup_fst)) { + KALDI_WARN << "For utterance " << key << ", feature frames " + << ", FST was empty after composing with normalization FST. " + << "This should be extremely rare (a few per corpus, at most)"; + } + + // Convert fst to lattice to extract posterior using forward backward. + Lattice lat_copy; + ConvertFstToLattice(sup_fst, &lat_copy); + + kaldi::uint64 props = lat_copy.Properties(fst::kFstProperties, false); + if (!(props & fst::kTopSorted)) { + if (fst::TopSort(&lat_copy) == false) + KALDI_ERR << "Cycles detected in lattice."; + } + + LatticeForwardBackward(lat_copy, post); +} + +} // namespace chain +} // namespace kaldi + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::chain; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Do forward-backward and collect pdf posteriors over lattices.\n" + "The labels are converted to a 1-index i.e. 
pdf-id + 1\n" + "An FST with labels as the 1-indexed pdf-ids can be optionally " + "provided to interpolate with the LM scores from lattice.\n" + "Usage: chain-lattice-to-post [options] [] " + "\n" + "\n"; + + BaseFloat acoustic_scale = 1.0, fst_scale = 0.0; + + ParseOptions po(usage); + po.Register("acoustic-scale", &acoustic_scale, + "Scaling factor for acoustic likelihoods"); + po.Register("fst-scale", &fst_scale, + "Scaling factor for the that will interpolated " + "with the lattice." + "Effectively this is (1-fst_scale) * lattice-graph-cost + fst_scale * fst-costs"); + po.Read(argc, argv); + + if (po.NumArgs() != 3 && po.NumArgs() != 4) { + po.PrintUsage(); + exit(1); + } + + std::string trans_model_rxfilename, + lattice_rspecifier, + fst_rxfilename, + post_wspecifier; + + if (po.NumArgs() == 3) { + trans_model_rxfilename = po.GetArg(1); + lattice_rspecifier = po.GetArg(2); + post_wspecifier = po.GetArg(3); + } else { + fst_rxfilename = po.GetArg(1); + trans_model_rxfilename = po.GetArg(2); + lattice_rspecifier = po.GetArg(3); + post_wspecifier = po.GetArg(4); + } + + TransitionModel trans_model; + ReadKaldiObject(trans_model_rxfilename, &trans_model); + + fst::StdVectorFst fst; + if (!fst_rxfilename.empty()) { + ReadFstKaldi(fst_rxfilename, &fst); + KALDI_ASSERT(fst.NumStates() > 0); + + if (fst_scale < 0.0 || fst_scale > 1.0) { + KALDI_ERR << "Invalid fst-scale; must be in [0.0, 1.0)"; + } + + if (fst_scale != 1.0) { + fst::ApplyProbabilityScale(fst_scale, &fst); + } + } + + fst::RmEpsilon(&fst); + fst::ArcSort(&fst, fst::ILabelCompare()); + + SequentialLatticeReader lattice_reader(lattice_rspecifier); + PosteriorWriter posterior_writer(post_wspecifier); + + int32 num_done = 0; + for (; !lattice_reader.Done(); lattice_reader.Next()) { + std::string key = lattice_reader.Key(); + + Lattice lat = lattice_reader.Value(); + + fst::ScaleLattice(fst::LatticeScale(1.0 - fst_scale, acoustic_scale), &lat); + + Posterior graph_post; + LatticeToNumeratorPost( + lat, trans_model, fst, + 1.0 - fst_scale , key, &graph_post); + + posterior_writer.Write(key, graph_post); + num_done++; + } + + KALDI_LOG << "Converted " << num_done << " lattices to posteriors"; + + return num_done > 0 ? 
0 : 1; + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/chainbin/nnet3-chain-split-and-get-egs.cc b/src/chainbin/nnet3-chain-split-and-get-egs.cc index 587deda9e32..39d99e84cc2 100644 --- a/src/chainbin/nnet3-chain-split-and-get-egs.cc +++ b/src/chainbin/nnet3-chain-split-and-get-egs.cc @@ -121,9 +121,9 @@ static bool ProcessFile(const chain::SupervisionOptions &sup_opts, int32 ivector_period, const TransitionModel &trans_model, const chain::SupervisionLatticeSplitter &sup_lat_splitter, - const VectorBase *deriv_weights, - bool include_numerator_post, BaseFloat min_post, - int32 supervision_length_tolerance, + const VectorBase *deriv_weights, + const Posterior *graph_posteriors, BaseFloat min_post, + int32 supervision_length_tolerance, const std::string &utt_id, bool compress, UtteranceSplitter *utt_splitter, @@ -168,44 +168,22 @@ static bool ProcessFile(const chain::SupervisionOptions &sup_opts, chain::Supervision supervision_part; - Lattice *lat_part = NULL; - - if (include_numerator_post) - lat_part = new Lattice(); - if (!sup_lat_splitter.GetFrameRangeSupervision(start_frame_subsampled, num_frames_subsampled, - &supervision_part, NULL, - lat_part)) + &supervision_part)) return false; - if (include_numerator_post) { - Posterior pdf_post; - LatticeToNumeratorPost( - *lat_part, trans_model, normalization_fst, - sup_opts.lm_scale, utt_id, &pdf_post); - KALDI_ASSERT(pdf_post.size() == num_frames_subsampled); - - Posterior check_post; - if (GetVerboseLevel() >= 2) { - LatticeToNumeratorPost( - sup_lat_splitter.GetLattice(), trans_model, normalization_fst, - sup_opts.lm_scale, utt_id, &check_post); - } - + if (graph_posteriors) { Posterior labels; labels.resize(num_frames_subsampled); for (int32 i = 0; i < num_frames_subsampled; i++) { - for (int32 j = 0; j < pdf_post[i].size(); j++) { - BaseFloat post = pdf_post[i][j].second; - KALDI_ASSERT(pdf_post[i][j].first > 0); - KALDI_VLOG(2) << pdf_post[i][j].first << " " << pdf_post[i][j].second - << "; " - << check_post[i + start_frame_subsampled][j].first - << check_post[i + start_frame_subsampled][j].second; + int32 t = i + start_frame_subsampled; + for (int32 j = 0; j < (*graph_posteriors)[t].size(); j++) { + BaseFloat post = (*graph_posteriors)[t][j].second; + KALDI_ASSERT((*graph_posteriors)[t][j].first > 0); if (post > min_post) { labels[i].push_back(std::make_pair( - pdf_post[i][j].first - 1, post)); // Convert from 1-index to 0-index + (*graph_posteriors)[t][j].first - 1, post)); // Convert from 1-index to 0-index } } } @@ -334,9 +312,9 @@ int main(int argc, char *argv[]) { chain::SupervisionOptions sup_opts; int32 srand_seed = 0; - std::string online_ivector_rspecifier, deriv_weights_rspecifier; + std::string online_ivector_rspecifier, deriv_weights_rspecifier, + graph_posterior_rspecifier; - bool include_numerator_post = true; BaseFloat min_post = 1e-8; ParseOptions po(usage); @@ -362,8 +340,8 @@ int main(int argc, char *argv[]) { "whether a frame's gradient must be backpropagated or not. 
" "Not specifying this is equivalent to specifying a vector of " "all 1s."); - po.Register("include-numerator-post", &include_numerator_post, - "Include numerator posterior"); + po.Register("graph-posterior-rspecifier", &graph_posterior_rspecifier, + "Pdf posteriors where the labels are 1-indexed"); po.Register("min-post", &min_post, "Minimum posterior to keep; this will " "avoid dumping out all posteriors."); @@ -440,6 +418,8 @@ int main(int argc, char *argv[]) { online_ivector_rspecifier); RandomAccessBaseFloatVectorReader deriv_weights_reader( deriv_weights_rspecifier); + RandomAccessPosteriorReader graph_posterior_reader( + graph_posterior_rspecifier); int32 num_err = 0; @@ -493,12 +473,25 @@ int main(int argc, char *argv[]) { } } + const Posterior *graph_posteriors = NULL; + if (!graph_posterior_rspecifier.empty()) { + if (!graph_posterior_reader.HasKey(key)) { + KALDI_WARN << "No graph posteriors for utterance " << key; + num_err++; + continue; + } else { + // this address will be valid until we call HasKey() or Value() + // again. + graph_posteriors = &(graph_posterior_reader.Value(key)); + } + } + sup_lat_splitter.LoadLattice(lat); if (!ProcessFile(sup_opts, normalization_fst, feats, online_ivector_feats, online_ivector_period, trans_model, sup_lat_splitter, - deriv_weights, include_numerator_post, min_post, + deriv_weights, graph_posteriors, min_post, supervision_length_tolerance, key, compress, &utt_splitter, &example_writer)) diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index fae2a1e6996..159ab22e4b6 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -175,6 +175,11 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, auto it = smbr_factors_.find(sup.name); if (it != smbr_factors_.end()) chain_config_copy.smbr_factor = it->second; + + if (chain_config_copy.smbr_factor > 0.0 && !chain_config_copy.use_smbr_objective) + KALDI_ERR << "smbr factor for " << sup.name << " = " + << chain_config_copy.smbr_factor + << " > 0.0, but --use-smbr-objective=false"; } { auto it = mmi_factors_.find(sup.name); @@ -248,7 +253,7 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, std::vector aux_objfs; aux_objfs.push_back(tot_l2_term); - if (chain_config_copy.use_smbr_objective) + if (chain_config_copy.smbr_factor > 0.0) aux_objfs.push_back(tot_mmi_objf); { @@ -258,7 +263,7 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, if (it == objf_info_.end()) { BaseFloat this_objf_scale = 1.0; std::vector aux_objf_scales(1, 1.0); // l2_term - if (chain_config_copy.use_smbr_objective) { + if (chain_config_copy.smbr_factor > 0.0) { this_objf_scale *= chain_config_copy.smbr_factor; aux_objf_scales.push_back( (chain_config_copy.mmi_factor + chain_config_copy.ml_factor)); diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 1719e9d2e6b..95732128cf9 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -485,6 +485,11 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, auto it = smbr_factors_.find(sup.name); if (it != smbr_factors_.end()) chain_config.smbr_factor = it->second; + + if (chain_config.smbr_factor > 0.0 && !chain_config.use_smbr_objective) + KALDI_ERR << "smbr factor for " << sup.name << " = " + << chain_config.smbr_factor + << " > 0.0, but --use-smbr-objective=false"; } { auto it = mmi_factors_.find(sup.name); @@ -565,7 +570,7 @@ void NnetChainTrainer::ProcessOutputs(bool 
is_backstitch_step2, std::vector objective_values; objective_values.push_back(tot_l2_term); - if (chain_config.use_smbr_objective) + if (chain_config.smbr_factor > 0.0) objective_values.push_back(tot_mmi_objf); { @@ -575,7 +580,7 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, if (it == objf_info_.end()) { BaseFloat this_objf_scale = 1.0; std::vector aux_objf_scales(1, 1.0); // l2_term - if (chain_config.use_smbr_objective) { + if (chain_config.smbr_factor > 0.0) { this_objf_scale *= chain_config.smbr_factor; aux_objf_scales.push_back( (chain_config.mmi_factor + chain_config.ml_factor)); From 173f0c6a66f3dc8235fc8b6ad0e88a56bea770c3 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 18 Apr 2018 12:08:32 -0400 Subject: [PATCH 147/174] Minor fix --- src/chain/chain-training.cc | 2 +- src/chain/chain-training.h | 2 +- src/nnet3/nnet-chain-diagnostics.cc | 2 +- src/nnet3/nnet-chain-training.cc | 11 ++++++----- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index f54bb671312..9f46e1b2e83 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -141,7 +141,7 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, } } -void ComputeKLNumeratorObjfAndDeriv(const ChainTrainingOptions &opts, +void ComputeChainDenominatorObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const CuMatrixBase &nnet_output, BaseFloat supervision_weight, int32 num_sequences, diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index a5edf784795..d4086d887d0 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -235,7 +235,7 @@ void ComputeChainSmbrObjfAndDeriv( This function uses supervision as numerator and does denominator computation. It can be uses, where numerator is fixed e.g. TS learning. */ -void ComputeKLNumeratorObjfAndDeriv(const ChainTrainingOptions &opts, +void ComputeChainDenominatorObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const CuMatrixBase &nnet_output, BaseFloat supervision_weight, int32 num_sequences, diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 159ab22e4b6..d6ceb93db5d 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -235,7 +235,7 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, computer->GetOutput(sup.name + "-teacher"); BaseFloat num_objf = 0, num_weight = 0.0; - ComputeKLNumeratorObjfAndDeriv(chain_config_copy, den_graph_, teacher_nnet_output, + ComputeChainDenominatorObjfAndDeriv(chain_config_copy, den_graph_, teacher_nnet_output, sup.supervision.weight, sup.supervision.num_sequences, &num_objf, &num_weight, &nnet_output_deriv, diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 95732128cf9..fe41f78390f 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -539,11 +539,12 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, computer->GetOutput(sup.name + "-teacher"); BaseFloat num_objf = 0, num_weight = 0.0; - ComputeKLNumeratorObjfAndDeriv(chain_config, den_graph_, teacher_nnet_output, - sup.supervision.weight, sup.supervision.num_sequences, - &num_objf, &num_weight, - &nnet_output_deriv, - (use_xent ? 
&xent_deriv : NULL)); + ComputeChainDenominatorObjfAndDeriv( + chain_config, den_graph_, teacher_nnet_output, + sup.supervision.weight, sup.supervision.num_sequences, + &num_objf, &num_weight, + &nnet_output_deriv, + (use_xent ? &xent_deriv : NULL)); } } From 90518d7dbaf5f3e42599ec09a07a410ad79e8579 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 23 Apr 2018 16:27:35 -0400 Subject: [PATCH 148/174] Adding graph post --- .../nnet3/train/chain_objf/acoustic_model.py | 4 -- .../steps/nnet3/chain/get_chain_graph_post.sh | 36 +++++++++++++ egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh | 52 ++++++++----------- egs/wsj/s5/steps/nnet3/chain/train.py | 2 - src/bin/decode-faster.cc | 6 ++- 5 files changed, 62 insertions(+), 38 deletions(-) create mode 100755 egs/wsj/s5/steps/nnet3/chain/get_chain_graph_post.sh diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index f5bf3b2d63f..475cbdc0fd7 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -80,7 +80,6 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, """{get_egs_script} {egs_opts} \ --cmd "{command}" \ --cmvn-opts "{cmvn_opts}" \ - --transform-dir "{transform_dir}" \ --online-ivector-dir "{ivector_dir}" \ --left-context {left_context} \ --right-context {right_context} \ @@ -98,9 +97,6 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, get_egs_script=get_egs_script, command=run_opts.egs_command, cmvn_opts=cmvn_opts if cmvn_opts is not None else '', - transform_dir=(transform_dir - if transform_dir is not None - else ''), ivector_dir=(online_ivector_dir if online_ivector_dir is not None else ''), diff --git a/egs/wsj/s5/steps/nnet3/chain/get_chain_graph_post.sh b/egs/wsj/s5/steps/nnet3/chain/get_chain_graph_post.sh new file mode 100755 index 00000000000..02c53ce93c7 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/get_chain_graph_post.sh @@ -0,0 +1,36 @@ +#! /bin/bash + +# Copyright 2018 Vimal Manohar +# Apache 2.0 + +fst_scale=0.5 +acwt=0.1 +cmd=run.pl + +. ./cmd.sh +. utils/parse_options.sh + +echo $* + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/chain/tdnn exp/chain/tri5_lats exp/chain/tdnn/egs" + exit 1 +fi + +chaindir=$1 +latdir=$2 +dir=$3 + +nj=$(cat $latdir/num_jobs) || exit 1 + +lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" + +$cmd JOB=1:$nj $dir/get_post.JOB.log \ + chain-lattice-to-post --acoustic-scale=$acwt --fst-scale=$fst_scale \ + $chaindir/den.fst $chaindir/0.trans_mdl "$lats_rspecifier" \ + ark,scp:$dir/numerator_post.JOB.ark,$dir/numerator_post.JOB.scp || exit 1 + +for n in $(seq $nj); do + cat $dir/numerator_post.$n.scp +done > $dir/numerator_post.scp diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh index 85d07f7f8d9..33c3cb50e79 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh @@ -52,7 +52,8 @@ left_tolerance= right_tolerance_silence= # Tolerances for silence phones left_tolerance_silence= -transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms +kl_latdir= +kl_fst_scale=0.5 stage=0 max_jobs_run=15 # This should be set to the maximum number of nnet3-chain-get-egs jobs you are @@ -184,18 +185,6 @@ if [ $len_uttlist -lt $num_utts_subset ]; then echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." 
&& exit 1; fi -[ -z "$transform_dir" ] && transform_dir=$latdir - -# because we'll need the features with a different number of jobs than $latdir, -# copy to ark,scp. -if [ -f $transform_dir/raw_trans.1 ]; then - echo "$0: using raw transforms from $transform_dir" - if [ $stage -le 0 ]; then - $cmd $dir/log/copy_transforms.log \ - copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" - fi -fi - ## Set up features. echo "$0: feature type is raw" feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" @@ -203,12 +192,6 @@ valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | a train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. -if [ -f $dir/trans.scp ]; then - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" - valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" - train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" -fi - tree-info $chaindir/tree | grep num-pdfs | awk '{print $2}' > $dir/info/num_pdfs || exit 1 if [ ! -z "$online_ivector_dir" ]; then @@ -222,7 +205,7 @@ else echo 0 >$dir/info/ivector_dim fi -if [ $stage -le 1 ]; then +if [ $stage -le 0 ]; then echo "$0: working out number of frames of training data" num_frames=$(steps/nnet2/get_num_frames.sh $data) echo $num_frames > $dir/info/num_frames @@ -323,15 +306,6 @@ if [ ! -z "$lattice_lm_scale" ]; then print (1.0 - $lattice_lm_scale);") fi -if [ -z $kl_fst_scale ]; then - kl_fst_scale=$normalization_fst_scale -fi - -graph_posterior_rspecifier= -if $include_numerator_post; then - graph_posterior_rspecifier="$lats_rspecifier chain-lattice-to-post --acoustic-scale=$acwt --fst-scale=$kl_fst_scale $chaindir/den.fst $chaindir/0.trans_mdl ark:- ark:- |" -fi - [ ! -z $phone_insertion_penalty ] && \ chain_supervision_all_opts="$chain_supervision_all_opts --supervision.phone-ins-penalty=$phone_insertion_penalty" @@ -352,6 +326,22 @@ echo $right_context > $dir/info/right_context echo $left_context_initial > $dir/info/left_context_initial echo $right_context_final > $dir/info/right_context_final +graph_posterior_rspecifier= +if [ ! -z "$kl_latdir" ]; then + if [ $stage -le 1 ]; then + steps/nnet3/chain/get_chain_graph_post.sh \ + --cmd "$cmd" --fst-scale $kl_fst_scale --acwt $acwt \ + $chaindir $kl_latdir $dir || exit 1 + fi + + if [ ! -s "$dir/numerator_post.scp" ]; then + echo "$0: Could not find $dir/numerator_post.scp. Something went wrong." + exit 1 + fi + + graph_posterior_rspecifier=scp:$dir/numerator_post.scp +fi + if [ $stage -le 2 ]; then echo "$0: Getting validation and training subset examples in background." 
rm $dir/.error 2>/dev/null @@ -368,7 +358,7 @@ if [ $stage -le 2 ]; then utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=$srand \ - ${graph_posterior_rspecifier:+--graph-posterior-rspecifier="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp | chain-lattice-to-post --acoustic-scale=$acwt --fst-scale=$kl_fst_scale $chaindir/den.fst $chaindir/0.trans_mdl scp:- ark:- |"} \ + ${graph_posterior_rspecifier:+--graph-posterior-rspecifier=$graph_posterior_rspecifier} \ $egs_opts $chaindir/normalization.fst \ "$valid_feats" $chaindir/tree $chaindir/0.trans_mdl \ ark,s,cs:- "ark:$dir/valid_all.cegs" || exit 1 & @@ -376,7 +366,7 @@ if [ $stage -le 2 ]; then utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=$srand \ - ${graph_posterior_rspecifier:+--graph-posterior-rspecifier="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp | chain-lattice-to-post --acoustic-scale=$acwt --fst-scale=$kl_fst_scale $chaindir/den.fst $chaindir/0.trans_mdl scp:- ark:- |"} \ + ${graph_posterior_rspecifier:+--graph-posterior-rspecifier=$graph_posterior_rspecifier} \ $egs_opts $chaindir/normalization.fst \ "$train_subset_feats" $chaindir/tree $chaindir/0.trans_mdl \ ark,s,cs:- "ark:$dir/train_subset_all.cegs" || exit 1 & diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 818e89a1d0a..7fe5258a79d 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -274,8 +274,6 @@ def process_args(args): "and exist; or the {0}/configs directory should exist." "".format(args.dir)) - if args.transform_dir is None: - args.transform_dir = args.lat_dir # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() if args.use_gpu in ["true", "false"]: diff --git a/src/bin/decode-faster.cc b/src/bin/decode-faster.cc index cbcdb771d56..30c4988dcee 100644 --- a/src/bin/decode-faster.cc +++ b/src/bin/decode-faster.cc @@ -39,7 +39,11 @@ int main(int argc, char *argv[]) { const char *usage = "Decode, reading log-likelihoods (of transition-ids or whatever symbol is on the graph)\n" - "as matrices. Note: you'll usually want decode-faster-mapped rather than this program.\n" + "as matrices. " + "The matrixes are 0-indexed, while the symbol on the graph is 1-indexed. 
So " + "the column i of matrix corresponds to likelihood of symbol i+1 in the graph.\n" + "Note: you'll usually want decode-faster-mapped rather than this program for " + "decoding acoustic models.\n" "\n" "Usage: decode-faster [options] []\n"; ParseOptions po(usage); From 2cf2acb2d69ac2b94b6c6de3dacff4d6a95f1593 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 23 Apr 2018 17:27:59 -0400 Subject: [PATCH 149/174] kl-latdir fix log dir --- egs/wsj/s5/steps/nnet3/chain/get_chain_graph_post.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/get_chain_graph_post.sh b/egs/wsj/s5/steps/nnet3/chain/get_chain_graph_post.sh index 02c53ce93c7..a582e9efc40 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_chain_graph_post.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_chain_graph_post.sh @@ -7,11 +7,11 @@ fst_scale=0.5 acwt=0.1 cmd=run.pl +echo $* + . ./cmd.sh . utils/parse_options.sh -echo $* - if [ $# -ne 3 ]; then echo "Usage: $0 " echo " e.g.: $0 exp/chain/tdnn exp/chain/tri5_lats exp/chain/tdnn/egs" @@ -26,7 +26,7 @@ nj=$(cat $latdir/num_jobs) || exit 1 lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" -$cmd JOB=1:$nj $dir/get_post.JOB.log \ +$cmd JOB=1:$nj $dir/log/get_post.JOB.log \ chain-lattice-to-post --acoustic-scale=$acwt --fst-scale=$fst_scale \ $chaindir/den.fst $chaindir/0.trans_mdl "$lats_rspecifier" \ ark,scp:$dir/numerator_post.JOB.ark,$dir/numerator_post.JOB.scp || exit 1 From 6564ff197778ba54156ff3bda27984debfa00f3b Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 25 Apr 2018 15:58:56 -0400 Subject: [PATCH 150/174] semisup: Bug fix in context info --- .../semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh | 6 ------ .../semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh | 6 ------ 2 files changed, 12 deletions(-) diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh index 9ba7da6e361..e95de232304 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh @@ -285,13 +285,9 @@ fi left_context=$model_left_context right_context=$model_right_context -left_context_initial=$model_left_context -right_context_final=$model_right_context egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") -egs_left_context_initial=$(perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)") -egs_right_context_final=$(perl -e "print int($right_context_final + $frame_subsampling_factor / 2)") if [ -z "$sup_egs_dir" ]; then sup_egs_dir=$dir/egs_${supervised_set_perturbed} @@ -308,7 +304,6 @@ if [ -z "$sup_egs_dir" ]; then echo "$0: generating egs from the supervised data" steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ --left-context $egs_left_context --right-context $egs_right_context \ - --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ --frame-subsampling-factor $frame_subsampling_factor \ --alignment-subsampling-factor $frame_subsampling_factor \ --frames-per-eg $frames_per_eg \ @@ -349,7 +344,6 @@ if [ -z "$unsup_egs_dir" ]; then --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ --left-tolerance $tolerance --right-tolerance $tolerance \ --left-context $egs_left_context --right-context 
$egs_right_context \ - --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ --frame-subsampling-factor $frame_subsampling_factor \ --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh index ad5d2b106b5..2d5b2f8480e 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh @@ -304,13 +304,9 @@ fi left_context=$model_left_context right_context=$model_right_context -left_context_initial=$model_left_context -right_context_final=$model_right_context egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") -egs_left_context_initial=$(perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)") -egs_right_context_final=$(perl -e "print int($right_context_final + $frame_subsampling_factor / 2)") if [ -z "$sup_egs_dir" ]; then sup_egs_dir=$dir/egs_${supervised_set_perturbed} @@ -327,7 +323,6 @@ if [ -z "$sup_egs_dir" ]; then echo "$0: generating egs from the supervised data" steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ --left-context $egs_left_context --right-context $egs_right_context \ - --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ --frame-subsampling-factor $frame_subsampling_factor \ --alignment-subsampling-factor $frame_subsampling_factor \ --frames-per-eg $frames_per_eg \ @@ -368,7 +363,6 @@ if [ -z "$unsup_egs_dir" ]; then --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ --left-tolerance $tolerance --right-tolerance $tolerance \ --left-context $egs_left_context --right-context $egs_right_context \ - --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ --frame-subsampling-factor $frame_subsampling_factor \ --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ From d0caae41cb587cce886dc2f389aafba7b69f47f4 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 27 Apr 2018 13:18:19 -0400 Subject: [PATCH 151/174] Adding new recipes --- ..._100k_250k_ex250k_semisupervised_conf_e.sh | 529 +++++++++++++++++ ..._100k_250k_ex250k_semisupervised_conf_f.sh | 525 +++++++++++++++++ ..._250k_ex250k_semisupervised_conf_smbr_a.sh | 523 +++++++++++++++++ ..._250k_ex250k_semisupervised_conf_smbr_b.sh | 529 +++++++++++++++++ ...nn_lstm_100k_250k_semisupervised_conf_d.sh | 517 +++++++++++++++++ ...nn_lstm_100k_250k_semisupervised_conf_e.sh | 516 +++++++++++++++++ ...nn_lstm_100k_250k_semisupervised_conf_f.sh | 516 +++++++++++++++++ ...nn_lstm_100k_250k_semisupervised_conf_g.sh | 516 +++++++++++++++++ ...tm_100k_250k_semisupervised_conf_smbr_a.sh | 512 +++++++++++++++++ ...tm_100k_250k_semisupervised_conf_smbr_b.sh | 516 +++++++++++++++++ ..._100k_500k_ex500k_semisupervised_smbr_a.sh | 531 ++++++++++++++++++ ...nn_lstm_100k_500k_semisupervised_conf_d.sh | 516 +++++++++++++++++ ...lstm_100k_500k_semisupervised_conf_kl_a.sh | 442 +++++++++++++++ ...tm_100k_500k_semisupervised_conf_smbr_b.sh | 518 +++++++++++++++++ ...tm_100k_500k_semisupervised_conf_smbr_c.sh | 446 +++++++++++++++ 
.../chain/tuning/run_tdnn_lstm_100k_d.sh | 228 ++++++++ .../chain/tuning/run_tdnn_lstm_100k_e.sh | 228 ++++++++ .../chain/tuning/run_tdnn_lstm_100k_f.sh | 228 ++++++++ .../chain/tuning/run_tdnn_lstm_100k_g.sh | 228 ++++++++ .../tuning/run_tdnn_lstm_100k_oracle_d.sh | 247 ++++++++ .../tuning/run_tdnn_lstm_100k_smbr_a.sh.orig | 240 ++++++++ 21 files changed, 9051 insertions(+) create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_e.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_f.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_smbr_a.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_smbr_b.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_d.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_e.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_f.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_g.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_a.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_b.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_ex500k_semisupervised_smbr_a.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_d.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_kl_a.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_b.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_c.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_d.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_e.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_f.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_g.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_d.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh.orig diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_e.sh new file mode 100644 index 00000000000..70dc30bd331 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_e.sh @@ -0,0 +1,529 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + 
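# [Editor's sketch, bash] The "Weights for phone LM (supervised, unsupervised):
# 3,2" setting in the header above becomes lm_weights=3,2 further down and is
# passed to make_weighted_den_fst.sh --num-repeats, which (as patched earlier in
# this series) simply lists each alignment dir's phones.<n>.gz that many times
# before estimating the phone LM. Toy illustration; "dir" is a placeholder path:
num_repeats_array=(3 2)
all_phones=""
for n in 0 1; do
  this_num_repeats=${num_repeats_array[$n]}
  all_phones="$all_phones $(for r in $(seq $this_num_repeats); do echo dir/phones.$n.gz; done)"
done
echo $all_phones
# dir/phones.0.gz dir/phones.0.gz dir/phones.0.gz dir/phones.1.gz dir/phones.1.gz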
+supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_ex250k_1e # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
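# [Editor's sketch, bash] The ${var:+...} expansions used above for comb_affix
# (and later for decode_iter) only append a suffix when the variable is already
# non-empty; quick demo with a made-up value:
comb_affix=comb_250k_ex250k_1e
echo "${comb_affix:+${comb_affix}_smart}"   # comb_250k_ex250k_1e_smart
comb_affix=
echo "${comb_affix:+${comb_affix}_smart}"   # prints an empty line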
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
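# [Editor's sketch, bash] Worked example of the context arithmetic above that
# turns model/chunk contexts into the egs contexts. The numbers
# (model_left_context=29, chunk_left_context=40, frame_subsampling_factor=3)
# are illustrative only, not taken from this recipe.
model_left_context=29; chunk_left_context=40; frame_subsampling_factor=3
left_context=$[model_left_context + chunk_left_context]   # 69
egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)")
echo $egs_left_context   # 70: int() adds 3/2 = 1 extra frame of margin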
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_f.sh new file mode 100644 index 00000000000..91faa23cc6e --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_f.sh @@ -0,0 +1,525 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_ex250k_1f # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
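+    # Note on the choice below: when $use_smart_splitting is true we use
+    # get_egs_split.sh, which splits the lattice supervision into chunks while
+    # taking the lattice structure into account; the plain get_egs.sh path
+    # just splits at fixed frame offsets.  (Rough description; see the two
+    # scripts themselves for the details.)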
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.smbr-leaky-hmm-coefficient 0.00001 \ + --chain.mmi-factor-schedule="output-0=1.0,1.0 output-1=1.0,1.0" \ + --chain.smbr-factor-schedule="output-0=0.0,0.0 output-1=0.0,0.0" \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain_unk || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + 
# the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + 
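+      # num_jobs above is just the number of distinct speakers in the decode
+      # set, so each parallel decoding job handles roughly one speaker.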
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_smbr_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_smbr_a.sh new file mode 100644 index 00000000000..6f0a5f932fb --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_smbr_a.sh @@ -0,0 +1,523 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_ex250k_1e_smbr # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +train_extra_opts="--chain.mmi-factor-schedule=1.0,1.0@0.2,0.5@0.2,0.5 --chain.smbr-factor-schedule=0.0,0.0@0.2,0.2@0.2,0.2" +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 
'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
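+  # combine_egs.sh treats the supervised and unsupervised egs dirs like two
+  # "languages": the --lang2weight values ($supervision_weights) scale the
+  # derivatives coming from each set, and --lang2num-copies ($num_copies, if
+  # set) can duplicate one set to rebalance the two amounts of data.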
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + $train_extra_opts --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain_unk || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_smbr_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_smbr_b.sh new file mode 100644 index 00000000000..37362657651 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_ex250k_semisupervised_conf_smbr_b.sh @@ -0,0 +1,529 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup 
+unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +smbr_leaky_hmm_coefficient=0.00001 +mmi_factor_schedule="output-0=1.0,1.0 output-1=0.2,0.2" +smbr_factor_schedule="output-0=0.0,0.0 output-1=0.4,0.4" + +# Semi-supervised options +comb_affix=comb_250k_ex250k_1b_smbr # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
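+    # In the call below, --lattice-lm-scale ($lattice_lm_scale, set near the
+    # top of this script) is the scale put on the lattice graph/LM scores when
+    # the decoded lattices are turned into supervision, and
+    # --lattice-prune-beam ($lattice_prune_beam) prunes the lattices first so
+    # the resulting supervision FSTs stay small.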
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ + --chain.mmi-factor-schedule="$mmi_factor_schedule" \ + --chain.smbr-factor-schedule="$smbr_factor_schedule" \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain_unk || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # 
the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + 
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_d.sh new file mode 100644 index 00000000000..f5f41fd67c1 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_d.sh @@ -0,0 +1,517 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7c # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1d # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=768 +cell_dim=768 +projection_dim=192 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 
'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
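+  # For illustration only: with supervision_weights=1.0,1.0 the two egs sources
+  # are given equal weight by --lang2weight, which roughly scales each source's
+  # contribution to the training derivatives.  A hypothetical setting such as
+  # --lang2weight "1.0,0.5" would down-weight the unsupervised egs, and
+  # --lang2num-copies "1,2" would duplicate the unsupervised egs before the
+  # combined archives are written (num_copies is left empty here).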
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_e.sh new file mode 100644 index 00000000000..61c55686efe --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_e.sh @@ -0,0 +1,516 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k 
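+# (note: an "_sp" suffix for the speed-perturbed versions of these two sets is
+# appended further down in this script, before lattices and egs are generated)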
+semisup_train_set= # semisup100k_250k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1e # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
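+  # Rough summary of the options below: the decoded lattices act as the
+  # supervision for these unsupervised egs.  --lattice-lm-scale (0.5 here)
+  # keeps the lattice LM scores at reduced weight, --lattice-prune-beam prunes
+  # unlikely lattice paths first, and --deriv-weights-scp supplies per-frame
+  # best-path posteriors that scale the derivatives (applied during training
+  # because apply_deriv_weights=true), so low-confidence frames contribute
+  # less to the gradient.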
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir 
$exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_f.sh new file mode 100644 index 00000000000..064cc03a00e --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_f.sh @@ -0,0 +1,516 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7g # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1f # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 
'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true --constrained false \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true --constrained false $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
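+  # Note: --egs-prefix "cegs." points combine_egs.sh at the chain-style egs
+  # archives (cegs.*) produced by get_egs.sh above, rather than the plain
+  # nnet3 "egs." naming it would otherwise assume.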
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_g.sh new file mode 100644 index 00000000000..70a62d8b152 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_g.sh @@ -0,0 +1,516 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k 
+semisup_train_set= # semisup100k_250k + +tdnn_affix=7f # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1g # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true --constrained false \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true --constrained false $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
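+ # This reuses the multilingual egs-combining machinery, treating the
+ # supervised and unsupervised egs as two "languages": examples from the
+ # first dir ($sup_egs_dir) go to the output-0/output-0-xent outputs and
+ # those from the second ($unsup_egs_dir) to output-1/output-1-xent, while
+ # --lang2weight (supervision_weights, 1.0,1.0 here) scales the derivatives
+ # contributed by each source.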
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_a.sh new file mode 100644 index 00000000000..23aa3531377 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_a.sh @@ -0,0 +1,512 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup 
+unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_250k_1b_smbr # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
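+
+ # The unsupervised egs below carry lattice supervision rather than a single
+ # alignment: the decode lattices are pruned to lattice_prune_beam, their
+ # scores are applied with LM scale lattice_lm_scale (0.5 here), and a
+ # left/right tolerance of $tolerance frames is allowed when the supervision
+ # is split into chunks.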
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.smbr-leaky-hmm-coefficient 0.00001 \ + --chain.mmi-factor-schedule="output-0=1.0,1.0 output-1=0.5,0.5" \ + --chain.smbr-factor-schedule="output-0=0.0,0.0 output-1=0.2,0.2" \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain_unk || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + 
# the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + 
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_b.sh new file mode 100644 index 00000000000..d5f49eda8e5 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_b.sh @@ -0,0 +1,516 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k +semisup_train_set= # semisup100k_250k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +smbr_leaky_hmm_coefficient=0.00001 +mmi_factor_schedule="output-0=1.0,1.0 output-1=0.2,0.2" +smbr_factor_schedule="output-0=0.0,0.0 output-1=0.4,0.4" + +# Semi-supervised options +comb_affix=comb_250k_1b_smbr # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,2 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 
'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
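+ # The combined cegs.* archives interleave supervised (output-0) and
+ # unsupervised (output-1) examples; the objective applied to each output is
+ # set in the train.py call below through --chain.mmi-factor-schedule and
+ # --chain.smbr-factor-schedule, which with the values at the top of this
+ # script means pure LF-MMI on output-0 and an MMI factor of 0.2 plus an
+ # sMBR factor of 0.4 on output-1.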
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ + --chain.mmi-factor-schedule="$mmi_factor_schedule" \ + --chain.smbr-factor-schedule="$smbr_factor_schedule" \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain_unk || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_ex500k_semisupervised_smbr_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_ex500k_semisupervised_smbr_a.sh new file mode 100644 index 00000000000..1a4c6760f9a --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_ex500k_semisupervised_smbr_a.sh @@ -0,0 +1,531 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup 
+unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_500k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +smbr_leaky_hmm_coefficient=0.00001 +mmi_factor_schedule="output-0=1.0,1.0 output-1=0.2,0.2" +smbr_factor_schedule="output-0=0.0,0.0 output-1=0.4,0.4" + +# Semi-supervised options +comb_affix=comb_500k_exp500k_1a_smbr # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,1 +num_copies=2,1 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
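+
+ # Per-frame deriv weights for these egs come from weights.scp (posteriors
+ # of the best-path pdfs, as noted in the header), so frames the seed model
+ # is unsure about contribute less to the gradient.  use_smart_splitting
+ # picks get_egs_split.sh ("smart" splitting of the lattice supervision)
+ # instead of the plain get_egs.sh used for the _naive variant.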
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ + --chain.mmi-factor-schedule="$mmi_factor_schedule" \ + --chain.smbr-factor-schedule="$smbr_factor_schedule" \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain_unk || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
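+ # --self-loop-scale 1.0 is the standard graph setting for 'chain' models;
+ # it pairs with the --acwt 1.0 --post-decode-acwt 10.0 used in the decode
+ # stage below.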
+ utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! $do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 
--post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_d.sh new file mode 100644 index 00000000000..1ac9c770b3b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_d.sh @@ -0,0 +1,516 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_500k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb_500k_1d # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,1 +num_copies=2,1 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +do_finetuning=false + +finetune_stage=-2 +finetune_suffix=_finetune +finetune_iter=final +num_epochs_finetune=1 +finetune_xent_regularize=0.1 +finetune_opts="--chain.mmi-factor-schedule=0.05,0.05 --chain.smbr-factor-schedule=0.05,0.05" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 
'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
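
The combination step above merges the supervised and unsupervised egs into one multilingual-style archive: --lang2weight scales the derivatives coming from each egs source, and --lang2num-copies controls how many times each source's egs are repeated in the combined set. As a rough illustration (using this script's defaults, supervised source first and unsupervised second):

    # Illustrative only; these are the defaults from the top of this script.
    supervision_weights=1.0,1.0   # derivative scale per egs source
    num_copies=2,1                # supervised egs duplicated 2x, unsupervised 1x

    sup_weight=${supervision_weights%%,*};  unsup_weight=${supervision_weights##*,}
    sup_copies=${num_copies%%,*};           unsup_copies=${num_copies##*,}

    echo "supervised egs:   weight=$sup_weight  copies=$sup_copies"
    echo "unsupervised egs: weight=$unsup_weight copies=$unsup_copies"
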
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_kl_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_kl_a.sh new file mode 100644 index 00000000000..e2a94495332 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_kl_a.sh @@ -0,0 +1,442 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k 
+semisup_train_set= # semisup100k_500k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +mmi_factor_schedule="output-0=1.0,1.0 output-1=0.5,0.5" +kl_factor_schedule="output-0=0.0,0.0 output-1=0.5,0.5" + +# Semi-supervised options +comb_affix=comb_500k_1a_kl0.5 # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,1 +num_copies=2,1 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
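
Before the unsupervised egs are dumped below, note how their supervision is weighted: --lattice-lm-scale 0.5 keeps scaled lattice LM scores as part of the supervision weights, --lattice-prune-beam 4.0 prunes the decoded lattices first, and --deriv-weights-scp supplies per-frame derivative weights taken from the best-path pdf posteriors (the weights.scp under $chaindir/best_path_...). A quick sanity check of those weights, assuming the standard Kaldi copy-vector binary is on the PATH (this is an inspection sketch only, not part of the recipe):

    # Peek at a few of the per-frame derivative weights (values in [0,1]).
    weights_scp=$chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp
    head -n 3 $weights_scp
    copy-vector scp:$weights_scp ark,t:- 2>/dev/null | head -n 1 | cut -c1-200
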
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.mmi-factor-schedule="$mmi_factor_schedule" \ + --chain.kl-factor-schedule="$kl_factor_schedule" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_b.sh new file mode 100644 index 00000000000..25ddcb186b4 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_b.sh @@ -0,0 +1,518 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup +unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_500k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +smbr_leaky_hmm_coefficient=0.00001 +mmi_factor_schedule="output-0=1.0,1.0 output-1=0.2,0.2" +smbr_factor_schedule="output-0=0.0,0.0 output-1=0.4,0.4" + +# Semi-supervised options +comb_affix=comb_500k_1b_smbr # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,1 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 
+dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." + exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 
input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
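
One detail about the get_egs call that follows: the supervised egs use --alignment-subsampling-factor 3 because their lattices come from the GMM system (tri4a) at the full frame rate, whereas the unsupervised egs in stage 13 use --alignment-subsampling-factor 1 because those lattices were produced by decoding with the chain model, which already operates at the subsampled rate. A small sketch of that choice, with assumed values for illustration:

    # Illustrative logic only: pick the alignment subsampling factor depending
    # on whether the supervision lattices are already at the reduced frame rate.
    frame_subsampling_factor=3
    lats_already_subsampled=false    # true for the unsupervised decode lattices
    if $lats_already_subsampled; then
      alignment_subsampling_factor=1
    else
      alignment_subsampling_factor=$frame_subsampling_factor
    fi
    echo "--alignment-subsampling-factor $alignment_subsampling_factor"
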
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
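
Two different weight pairs appear in this recipe and are easy to confuse: lm_weights, used by make_weighted_den_fst.sh in stage 10, sets how many times the supervised phone sequences and the unsupervised best-path phone sequences are counted when estimating the denominator phone LM, while supervision_weights, passed to combine_egs above as --lang2weight, scales the derivatives of the egs from each source during training. A one-screen reminder using this script's defaults (repeated here for clarity only):

    lm_weights=3,1              # stage 10: supervised alignments counted 3x,
                                # unsupervised best-path phone sequences 1x,
                                # when building the weighted denominator FST.
    supervision_weights=1.0,1.0 # stages 14-15: derivative scale for the
                                # supervised (output-0) and unsupervised
                                # (output-1) egs.
    echo "phone-LM weights: $lm_weights; supervision weights: $supervision_weights"
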
+fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ + --chain.mmi-factor-schedule="$mmi_factor_schedule" \ + --chain.smbr-factor-schedule="$smbr_factor_schedule" \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain_unk || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +if ! 
$do_finetuning; then + wait + exit 0 +fi + +if [ $stage -le 19 ]; then + mkdir -p ${dir}${finetune_suffix} + + for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do + cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 + done + cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 + + nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ + $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw + + if [ $finetune_stage -le -1 ]; then + finetune_stage=-1 + fi + + steps/nnet3/chain/train.py --stage $finetune_stage \ + --trainer.input-model ${dir}${finetune_suffix}/init.raw \ + --egs.dir "$sup_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ + --chain.xent-regularize $finetune_xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch "150=64/300=32" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs_finetune \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --trainer.optimization.do-final-combination false \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir ${dir}${finetune_suffix} || exit 1; +fi + +dir=${dir}${finetune_suffix} + +if [ $stage -le 20 ]; then + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 150 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_c.sh new file mode 100644 index 00000000000..66c89bb67d2 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_c.sh @@ -0,0 +1,446 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 +exp=exp/semisup_100k + +supervised_set=train_sup 
+unsupervised_set=train_unsup100k_500k +semisup_train_set= # semisup100k_500k + +tdnn_affix=7d_h1024 # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" +tree_affix=bi_c + +nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +smbr_leaky_hmm_coefficient=0.00001 +mmi_factor_schedule="output-0=1.0,1.0 output-1=0.2,0.2" +smbr_factor_schedule="output-0=0.0,0.0 output-1=0.4,0.4" + +# Semi-supervised options +comb_affix=comb_500k_1c_smbr # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,1 +num_copies=2,1 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +if $use_smart_splitting; then + comb_affix=${comb_affix:+${comb_affix}_smart} +else + comb_affix=${comb_affix:+${comb_affix}_naive} +fi + +RANDOM=0 + +if ! cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor +fi + +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +sup_ali_dir=$exp/tri4a + +treedir=$exp/chain${nnet3_affix}/tree_${tree_affix} +if [ ! -f $treedir/final.mdl ]; then + echo "$0: $treedir/final.mdl does not exist." 
+ exit 1 +fi + +diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tree differ"; exit 1; } + +dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $chaindir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri4a_${supervised_set}_unk_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
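+    # The unsupervised-data egs below are built from the decode lattices:
+    # the lattices are pruned to beam $lattice_prune_beam, the lattice LM
+    # scores are applied with scale $lattice_lm_scale, and the per-frame
+    # derivative weights are taken from the best-path pdf posteriors
+    # (weights.scp) computed earlier.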
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ + --chain.mmi-factor-schedule="$mmi_factor_schedule" \ + --chain.smbr-factor-schedule="$smbr_factor_schedule" \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain_unk || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
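+  # (mkgraph takes the HMM topology and transition model from the chain
+  # model in $dir; only the lexicon and grammar come from $test_lang.  In
+  # the decode stage below, output-0 -- apparently the output trained on
+  # the supervised data -- is renamed to 'output' with nnet3-copy before
+  # decoding.)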
+ utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + $graph_dir data/${decode_set}_hires $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi + +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_d.sh new file mode 100755 index 00000000000..265a8c05a11 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_d.sh @@ -0,0 +1,228 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. + +# configs for 'chain' +stage=0 +tdnn_affix=7d +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_c +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
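+  # (7000 leaves, built on the speed-perturbed supervised data; the
+  # frame-subsampling-factor of 3 matches the reduced frame rate of the
+  # chain model.)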
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_e.sh new file mode 100755 index 00000000000..05fe3a017e3 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_e.sh @@ -0,0 +1,228 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. 
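+# Compared with run_tdnn_lstm_100k_d.sh, this version appears to differ in
+# using larger layers (hidden/cell dim 1024 and projection dim 256, vs
+# 512/512/128), the bi_e tree instead of bi_c, and the lang_poco_test
+# decoding graph instead of lang_poco_test_unk.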
+ +# configs for 'chain' +stage=0 +tdnn_affix=7e +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_e +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_f.sh new file mode 100755 index 00000000000..9bc98d90934 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_f.sh @@ -0,0 +1,228 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. + +# configs for 'chain' +stage=0 +tdnn_affix=7f +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_e +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true --constrained false" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_g.sh new file mode 100755 index 00000000000..ff4e8d55efc --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_g.sh @@ -0,0 +1,228 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. 
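+# Compared with run_tdnn_lstm_100k_e.sh, this version appears to differ
+# mainly in passing "--constrained false" when generating egs (see
+# --egs.opts below).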
+ +# configs for 'chain' +stage=0 +tdnn_affix=7g +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_e +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true --constrained false" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_d.sh new file mode 100755 index 00000000000..876633fedd6 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_oracle_d.sh @@ -0,0 +1,247 @@ +#!/bin/bash +set -e +set -u + +# This is oracle experiment for semi-supervised training with 100 hours +# of supervised data and 250 hours of unsupervised data + +# configs for 'chain' +stage=0 +tdnn_affix=7d_oracle +train_stage=-10 +get_egs_stage=-10 +decode_iter= +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k_n10k +base_train_set=train_oracle100k_250k_n10k +tree_affix=bi_a +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ ! 
-f $treedir/final.mdl ]; then + echo "$0: Could not find $treedir/final.mdl" + exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh.orig b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh.orig new file mode 100755 index 00000000000..a7505376a19 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_smbr_a.sh.orig @@ -0,0 +1,240 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe for training a model on a subset of around 15 hours. 
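+# This recipe adds sequence-discriminative (sMBR) training to the chain
+# setup: the relative weights of the MMI and sMBR objectives are scheduled
+# over training via --chain.mmi-factor-schedule and
+# --chain.smbr-factor-schedule (see extra_opts below).  Note that this is a
+# '.orig' backup file and still contains unresolved merge-conflict markers.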
+ +# configs for 'chain' +stage=0 +tdnn_affix=7smbr_a +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup +ivector_train_set=train_sup +tree_affix=bi_c +nnet3_affix= +chain_affix= +exp=exp/semisup_100k +gmm=tri4a +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 +extra_opts="--chain.mmi-factor-schedule=1.0,1.0@0.1,0.5@0.2,0.5 --chain.smbr-factor-schedule=0.0,0.0@0.1,0.5@0.2,0.5" +chain_smbr_extra_opts= +smbr_leaky_hmm_coefficient=0.00001 +leaky_hmm_coefficient=0.1 +<<<<<<< Updated upstream +l2_regularize=0.0 # 00005 +======= +>>>>>>> Stashed changes + +# decode options +extra_left_context=50 +extra_right_context=0 + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient $leaky_hmm_coefficient \ + --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ +<<<<<<< Updated upstream + --chain.l2-regularize $l2_regularize \ +======= + --chain.l2-regularize 0.0 \ +>>>>>>> Stashed changes + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --cleanup.preserve-model-interval 10 \ + --dir $dir --lang $lang $extra_opts || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_poco_test_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; From 5b0af2e9b73a5acf62a0839813827c2586bdede0 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 1 May 2018 14:30:15 -0400 Subject: [PATCH 152/174] Minor fix --- egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh index 33c3cb50e79..1f386e0c7a8 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh @@ -339,7 +339,7 @@ if [ ! -z "$kl_latdir" ]; then exit 1 fi - graph_posterior_rspecifier=scp:$dir/numerator_post.scp + graph_posterior_rspecifier="scp:$dir/numerator_post.scp" fi if [ $stage -le 2 ]; then @@ -358,7 +358,7 @@ if [ $stage -le 2 ]; then utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=$srand \ - ${graph_posterior_rspecifier:+--graph-posterior-rspecifier=$graph_posterior_rspecifier} \ + ${graph_posterior_rspecifier:+--graph-posterior-rspecifier="$graph_posterior_rspecifier"} \ $egs_opts $chaindir/normalization.fst \ "$valid_feats" $chaindir/tree $chaindir/0.trans_mdl \ ark,s,cs:- "ark:$dir/valid_all.cegs" || exit 1 & @@ -366,7 +366,7 @@ if [ $stage -le 2 ]; then utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ lattice-align-phones --write-compact=false --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ nnet3-chain-split-and-get-egs $chain_supervision_all_opts $ivector_opts --srand=$srand \ - ${graph_posterior_rspecifier:+--graph-posterior-rspecifier=$graph_posterior_rspecifier} \ + ${graph_posterior_rspecifier:+--graph-posterior-rspecifier="$graph_posterior_rspecifier"} \ $egs_opts $chaindir/normalization.fst \ "$train_subset_feats" $chaindir/tree $chaindir/0.trans_mdl \ ark,s,cs:- "ark:$dir/train_subset_all.cegs" || exit 1 & From ed30167454f36d3ec1f75cbdd8a391ebd74b7a80 Mon Sep 17 00:00:00 2001 From: Cloud User Date: Wed, 2 May 2018 23:02:46 +0000 Subject: [PATCH 153/174] Add subsplit support --- egs/wsj/s5/steps/nnet3/decode_semisup.sh | 102 +++++++++++++++--- src/latbin/Makefile | 3 +- ...ttice-determinize-phone-pruned-parallel.cc | 60 ++++++++--- 3 files changed, 138 insertions(+), 27 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/decode_semisup.sh b/egs/wsj/s5/steps/nnet3/decode_semisup.sh index 3916b0edb1c..b4ab65807c3 100755 --- a/egs/wsj/s5/steps/nnet3/decode_semisup.sh +++ b/egs/wsj/s5/steps/nnet3/decode_semisup.sh @@ -8,6 +8,8 @@ # Begin configuration section. stage=1 nj=4 # number of decoding jobs. +sub_split=1 +keep_subsplit=false acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. 
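+# sub_split > 1 splits each of the nj decoding jobs into smaller pieces so
+# that stragglers from one job do not hold up the whole decode; see the
+# sub-split loop below.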
post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the # regular scoring script works. @@ -108,7 +110,7 @@ extra_opts= lat_wspecifier="ark:|" if ! $write_compact; then extra_opts="--determinize-lattice=false" - lat_wspecifier="ark:| lattice-determinize-phone-pruned --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize --word-determinize=$word_determinize --write-compact=false $model ark:- ark:- |" + lat_wspecifier="ark:| lattice-determinize-phone-pruned-parallel --num-threads=$num_threads --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize --word-determinize=$word_determinize --write-compact=false $model ark:- ark:- |" fi if [ "$post_decode_acwt" == 1.0 ]; then @@ -123,19 +125,93 @@ if [ -f $srcdir/frame_subsampling_factor ]; then frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" fi +# if this job is interrupted by the user, we want any background jobs to be +# killed too. +cleanup() { + local pids=$(jobs -pr) + [ -n "$pids" ] && kill $pids +} +trap "cleanup" INT QUIT TERM EXIT + + if [ $stage -le 1 ]; then - $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ - nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ - --frames-per-chunk=$frames_per_chunk \ - --extra-left-context=$extra_left_context \ - --extra-right-context=$extra_right_context \ - --extra-left-context-initial=$extra_left_context_initial \ - --extra-right-context-final=$extra_right_context_final \ - --minimize=$minimize --word-determinize=$word_determinize \ - --max-active=$max_active --min-active=$min_active --beam=$beam \ - --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ - --word-symbol-table=$graphdir/words.txt ${extra_opts} "$model" \ - $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1; + if [ $sub_split -eq 1 ]; then + $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=$minimize --word-determinize=$word_determinize \ + --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt ${extra_opts} $model \ + $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1; + else + # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim + # to have at most two jobs running at each time. The idea is that if we have + # stragglers from one job, we can be processing another one at the same time. + rm $dir/.error 2>/dev/null + + prev_pid= + for n in $(seq $[nj+1]); do + lat_subset_wspecifier="ark:|" + if ! 
$write_compact; then + lat_subset_wspecifier="ark:| lattice-determinize-phone-pruned-parallel --num-threads=$num_threads --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize --word-determinize=$word_determinize --write-compact=false $model ark:- ark:- |" + fi + if [ "$post_decode_acwt" == 1.0 ]; then + lat_subset_wspecifier="$lat_subset_wspecifier gzip -c >$dir/lat.$n.JOB.gz" + else + lat_subset_wspecifier="$lat_subset_wspecifier lattice-scale --acoustic-scale=$post_decode_acwt --write-compact=$write_compact ark:- ark:- | gzip -c >$dir/lat.$n.JOB.gz" + fi + + if [ $n -gt $nj ]; then + this_pid= + elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $model ]; then + echo "$0: Not processing subset $n as already done (delete $dir/.done.$n if not)"; + this_pid= + else + sdata2=$data/split$nj/$n/split${sub_split}utt; + utils/split_data.sh --per-utt $sdata/$n $sub_split || exit 1; + mkdir -p $dir/log/$n + mkdir -p $dir/part + feats_subset=$(echo $feats | sed s:JOB/:$n/split${sub_split}utt/JOB/:g) + $cmd --num-threads $num_threads JOB=1:$sub_split $dir/log/$n/decode.JOB.log \ + nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=$minimize --word-determinize=$word_determinize \ + --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt ${extra_opts} $model \ + $graphdir/HCLG.fst "$feats_subset" "$lat_subset_wspecifier" || touch $dir/.error & + this_pid=$! + fi + if [ ! -z "$prev_pid" ]; then # Wait for the previous job to merge lattices. + wait $prev_pid + [ -f $dir/.error ] && \ + echo "$0: error generating lattices" && exit 1; + + if ! 
$keep_subsplit; then + rm $dir/.merge_error 2>/dev/null + echo "$0: Merging archives for data subset $prev_n" + for k in $(seq $sub_split); do + gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error; + done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error; + [ -f $dir/.merge_error ] && \ + echo "$0: Merging lattices for subset $prev_n failed" && exit 1; + rm $dir/lat.$prev_n.*.gz + fi + touch $dir/.done.$prev_n + fi + prev_n=$n + prev_pid=$this_pid + done + fi fi diff --git a/src/latbin/Makefile b/src/latbin/Makefile index 5248710c6c8..bcffbb43168 100644 --- a/src/latbin/Makefile +++ b/src/latbin/Makefile @@ -25,8 +25,7 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \ lattice-determinize-phone-pruned-parallel lattice-expand-ngram \ lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons \ lattice-arc-post lattice-determinize-non-compact lattice-lmrescore-kaldi-rnnlm \ - lattice-lmrescore-pruned lattice-lmrescore-kaldi-rnnlm-pruned \ - lattice-determinize-phone-pruned-non-compact lattice-top-sort + lattice-lmrescore-pruned lattice-lmrescore-kaldi-rnnlm-pruned OBJFILES = diff --git a/src/latbin/lattice-determinize-phone-pruned-parallel.cc b/src/latbin/lattice-determinize-phone-pruned-parallel.cc index 6d273d433c6..fa29e7dc8d3 100644 --- a/src/latbin/lattice-determinize-phone-pruned-parallel.cc +++ b/src/latbin/lattice-determinize-phone-pruned-parallel.cc @@ -38,12 +38,20 @@ class DeterminizeLatticeTask { BaseFloat beam, Lattice *lat, CompactLatticeWriter *clat_writer, + LatticeWriter *lat_writer, int32 *num_warn): trans_model_(&trans_model), opts_(opts), key_(key), acoustic_scale_(acoustic_scale), beam_(beam), - lat_(lat), clat_writer_(clat_writer), num_warn_(num_warn) { } + lat_(lat), clat_writer_(clat_writer), + lat_writer_(lat_writer), num_warn_(num_warn) { + KALDI_ASSERT((lat_writer_ && !clat_writer_) || + (!lat_writer_ && clat_writer_)); + } void operator () () { + if (lat_writer_) + ComputeAcousticScoresMap(*lat_, &acoustic_scores_); + // We apply the acoustic scale before determinization and will undo it // afterward, since it can affect the result. fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale_), lat_); @@ -57,16 +65,28 @@ class DeterminizeLatticeTask { delete lat_; lat_ = NULL; - - // Invert the original acoustic scaling - fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale_), - &det_clat_); } ~DeterminizeLatticeTask() { - KALDI_VLOG(2) << "Wrote lattice with " << det_clat_.NumStates() - << " for key " << key_; - clat_writer_->Write(key_, det_clat_); + if (clat_writer_) { + KALDI_VLOG(2) << "Wrote lattice with " << det_clat_.NumStates() + << " for key " << key_; + // Invert the original acoustic scaling + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale_), + &det_clat_); + clat_writer_->Write(key_, det_clat_); + } else { + KALDI_VLOG(2) << "Wrote lattice with " << det_clat_.NumStates() + << " for key " << key_; + Lattice out_lat; + fst::ConvertLattice(det_clat_, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores_, &out_lat); + + lat_writer_->Write(key_, out_lat); + } } private: const TransitionModel *trans_model_; @@ -80,8 +100,12 @@ class DeterminizeLatticeTask { // destructor. 
CompactLattice det_clat_;
   CompactLatticeWriter *clat_writer_;
+  LatticeWriter *lat_writer_;
   int32 *num_warn_;
-
+
+  // Used to compute a map from each (t, tid) to (sum_of_acoustic_scores, count)
+  unordered_map<std::pair<int32, int32>, std::pair<BaseFloat, int32>,
+                PairHasher<int32> > acoustic_scores_;
 };

 } // namespace kaldi

@@ -107,6 +131,7 @@ int main(int argc, char *argv[]) {
         " --acoustic-scale=0.1 final.mdl ark:in.lats ark:det.lats\n";

     ParseOptions po(usage);
+    bool write_compact = true;
     BaseFloat acoustic_scale = 1.0;
     BaseFloat beam = 10.0;
@@ -114,6 +139,12 @@ int main(int argc, char *argv[]) {
     fst::DeterminizeLatticePhonePrunedOptions determinize_opts;
     determinize_opts.max_mem = 50000000;

+    po.Register("write-compact", &write_compact,
+                "If true, write in normal (compact) form. "
+                "--write-compact=false allows you to retain frame-level "
+                "acoustic score information, but this requires the input "
+                "to be in non-compact form e.g. undeterminized lattice "
+                "straight from decoding.");
     po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic"
                 " likelihoods.");
     po.Register("beam", &beam, "Pruning beam [applied after acoustic scaling].");
@@ -137,8 +168,13 @@ int main(int argc, char *argv[]) {
     // accepts.
     SequentialLatticeReader lat_reader(lats_rspecifier);

-    // Writes as compact lattice.
-    CompactLatticeWriter compact_lat_writer(lats_wspecifier);
+    CompactLatticeWriter *compact_lat_writer = NULL;
+    LatticeWriter *lat_writer = NULL;
+
+    if (write_compact)
+      compact_lat_writer = new CompactLatticeWriter(lats_wspecifier);
+    else
+      lat_writer = new LatticeWriter(lats_wspecifier);

     TaskSequencer<DeterminizeLatticeTask> sequencer(sequencer_opts);
@@ -157,7 +193,7 @@ int main(int argc, char *argv[]) {
       DeterminizeLatticeTask *task = new DeterminizeLatticeTask(
           trans_model, determinize_opts, key, acoustic_scale, beam,
-          lat, &compact_lat_writer, &n_warn);
+          lat, compact_lat_writer, lat_writer, &n_warn);
       sequencer.Run(task);
       n_done++;

From f2d3994ec8fd7c191cabf835ae16b1bd79feb36a Mon Sep 17 00:00:00 2001
From: Vimal Manohar
Date: Thu, 3 May 2018 16:00:17 -0400
Subject: [PATCH 154/174] Add sub_split option to rescoring

---
 .../lmrescore_const_arpa_undeterminized.sh | 94 ++++++++++++++++---
 egs/wsj/s5/steps/nnet3/decode_semisup.sh   |  4 +-
 src/chain/chain-supervision-splitter.cc    |  3 +
 3 files changed, 85 insertions(+), 16 deletions(-)

diff --git a/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh b/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh
index a075b8debe8..933aa9f7cdd 100755
--- a/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh
+++ b/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh
@@ -21,6 +21,7 @@
 # Begin configuration section.
cmd=run.pl +keep_subsplit=false skip_scoring=false stage=1 scoring_opts= @@ -73,25 +74,88 @@ fi oldlmcommand="fstproject --project_output=true $oldlm |" mkdir -p $outdir/log -nj=`cat $indir/num_jobs` || exit 1; +nj=$(cat $indir/num_jobs) || exit 1; cp $indir/num_jobs $outdir -lats_rspecifier="ark:gunzip -c $indir/lat.JOB.gz |" - -lats_wspecifier="ark:| gzip -c > $outdir/lat.JOB.gz" +sub_split=1 +if [ -f $indir/sub_split ]; then + sub_split=$(cat $indir/sub_split) || exit 1 +fi if [ $stage -le 1 ]; then - $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$beam \ - "ark:gunzip -c $indir/lat.JOB.gz |" ark:- \| \ - lattice-scale --lm-scale=0.0 --acoustic-scale=0.0 ark:- ark:- \| \ - lattice-lmrescore --lm-scale=-1.0 ark:- "$oldlmcommand" ark:- \| \ - lattice-lmrescore-const-arpa --lm-scale=1.0 \ - ark:- "$newlm" ark:- \| \ - lattice-project ark:- ark:- \| \ - lattice-compose --write-compact=$write_compact \ - "$lats_rspecifier" \ - ark,s,cs:- "$lats_wspecifier" || exit 1 + if [ $sub_split -eq 1 ]; then + lats_rspecifier="ark:gunzip -c $indir/lat.JOB.gz |" + lats_wspecifier="ark:| gzip -c > $outdir/lat.JOB.gz" + + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$beam \ + "ark:gunzip -c $indir/lat.JOB.gz |" ark:- \| \ + lattice-scale --lm-scale=0.0 --acoustic-scale=0.0 ark:- ark:- \| \ + lattice-lmrescore --lm-scale=-1.0 ark:- "$oldlmcommand" ark:- \| \ + lattice-lmrescore-const-arpa --lm-scale=1.0 \ + ark:- "$newlm" ark:- \| \ + lattice-project ark:- ark:- \| \ + lattice-compose --write-compact=$write_compact \ + "$lats_rspecifier" \ + ark,s,cs:- "$lats_wspecifier" || exit 1 + else + # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim + # to have at most two jobs running at each time. The idea is that if we have + # stragglers from one job, we can be processing another one at the same time. + rm $dir/.error 2>/dev/null + + prev_pid= + for n in $(seq $[nj+1]); do + lats_rspecifier="ark:gunzip -c $indir/lat.$n.JOB.gz |" + lats_wspecifier="ark:| gzip -c > $outdir/lat.$n.JOB.gz" + + if [ $n -gt $nj ]; then + this_pid= + elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $model ]; then + echo "$0: Not processing subset $n as already done (delete $dir/.done.$n if not)"; + this_pid= + else + mkdir -p $dir/log/$n + mkdir -p $dir/part + + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$beam \ + "ark:gunzip -c $indir/lat.JOB.gz |" ark:- \| \ + lattice-scale --lm-scale=0.0 --acoustic-scale=0.0 ark:- ark:- \| \ + lattice-lmrescore --lm-scale=-1.0 ark:- "$oldlmcommand" ark:- \| \ + lattice-lmrescore-const-arpa --lm-scale=1.0 \ + ark:- "$newlm" ark:- \| \ + lattice-project ark:- ark:- \| \ + lattice-compose --write-compact=$write_compact \ + "$lats_rspecifier" \ + ark,s,cs:- "$lats_wspecifier" || touch $dir/.error & + this_pid=$! + fi + if [ ! -z "$prev_pid" ]; then # Wait for the previous job to merge lattices. + wait $prev_pid + [ -f $dir/.error ] && \ + echo "$0: error generating lattices" && exit 1; + + if ! 
$keep_subsplit; then + rm $dir/.merge_error 2>/dev/null + echo "$0: Merging archives for data subset $prev_n" + for k in $(seq $sub_split); do + gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error; + done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error; + [ -f $dir/.merge_error ] && \ + echo "$0: Merging lattices for subset $prev_n failed" && exit 1; + rm $dir/lat.$prev_n.*.gz + fi + touch $dir/.done.$prev_n + fi + prev_n=$n + prev_pid=$this_pid + done + fi +fi + +if $keep_subsplit; then + echo $sub_split > $dir/sub_split fi if ! $skip_scoring && [ $stage -le 2 ]; then diff --git a/egs/wsj/s5/steps/nnet3/decode_semisup.sh b/egs/wsj/s5/steps/nnet3/decode_semisup.sh index b4ab65807c3..94f690358a1 100755 --- a/egs/wsj/s5/steps/nnet3/decode_semisup.sh +++ b/egs/wsj/s5/steps/nnet3/decode_semisup.sh @@ -133,7 +133,6 @@ cleanup() { } trap "cleanup" INT QUIT TERM EXIT - if [ $stage -le 1 ]; then if [ $sub_split -eq 1 ]; then $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ @@ -214,6 +213,9 @@ if [ $stage -le 1 ]; then fi fi +if $keep_subsplit; then + echo $sub_split > $dir/sub_split +fi if [ $stage -le 2 ]; then if ! $skip_diagnostics ; then diff --git a/src/chain/chain-supervision-splitter.cc b/src/chain/chain-supervision-splitter.cc index d40586e3783..a1c9597ca64 100644 --- a/src/chain/chain-supervision-splitter.cc +++ b/src/chain/chain-supervision-splitter.cc @@ -440,8 +440,11 @@ bool SupervisionLatticeSplitter::GetSupervision( KALDI_ASSERT(transition_id_fst.NumStates() > 0); if (opts_.convert_to_unconstrained) { + supervision->label_dim = trans_model_.NumTransitionIds(); std::swap(transition_id_fst, supervision->fst); return ConvertSupervisionToUnconstrained(trans_model_, supervision); + } else { + supervision->label_dim = trans_model_.NumPdfs(); } fst::TableComposeOptions compose_opts; From fcc156fbc98e7c9a1b60c601ab01939134cfd5ff Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 5 Jun 2018 16:26:15 -0400 Subject: [PATCH 155/174] Fixing smbr recipe --- ...nn_lstm_100k_250k_semisupervised_conf_f.sh | 18 ++-- ...nn_lstm_100k_250k_semisupervised_conf_g.sh | 24 ++--- ...tm_100k_250k_semisupervised_conf_smbr_b.sh | 93 ++----------------- ..._100k_500k_ex500k_semisupervised_smbr_a.sh | 8 +- ...nn_lstm_100k_500k_semisupervised_conf_d.sh | 31 +++---- ...tm_100k_500k_semisupervised_conf_smbr_b.sh | 20 ++-- ...tm_100k_500k_semisupervised_conf_smbr_c.sh | 2 +- ...run_tdnn_lstm_15k_semisupervised_conf_b.sh | 14 +-- src/chainbin/nnet3-chain-compute-prob.cc | 2 +- 9 files changed, 63 insertions(+), 149 deletions(-) diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_f.sh index 064cc03a00e..dfdc36d6428 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_f.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_f.sh @@ -21,7 +21,7 @@ semisup_train_set= # semisup100k_250k tdnn_affix=7g # affix for the supervised chain-model directory train_supervised_opts="--stage -10 --train-stage -10" -tree_affix=bi_c +tree_affix=bi_e nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used @@ -181,7 +181,7 @@ fi if [ $stage -le 8 ]; then steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ - data/${unsupervised_set}_sp_hires $lang \ + data/${unsupervised_set}_sp_hires \ 
$chaindir/decode_${unsupervised_set}_sp${decode_affix} \ $chaindir/best_path_${unsupervised_set}_sp${decode_affix} echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor @@ -262,11 +262,11 @@ if [ $stage -le 11 ]; then # similar in the xent and regular final layers. output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - output name=output-0 input=output.affine@$label_delay skip-in-init=true - output name=output-1 input=output.affine@$label_delay skip-in-init=true + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay - output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true - output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ @@ -358,9 +358,9 @@ fi comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi if [ $stage -le 14 ]; then - steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ - --minibatch-size 64 --frames-per-iter 1500000 \ - --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir touch $comb_egs_dir/.nodelete # keep egs around when that run dies. fi diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_g.sh index 70a62d8b152..9dcfb693eda 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_g.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_g.sh @@ -21,7 +21,7 @@ semisup_train_set= # semisup100k_250k tdnn_affix=7f # affix for the supervised chain-model directory train_supervised_opts="--stage -10 --train-stage -10" -tree_affix=bi_c +tree_affix=bi_f nnet3_affix= # affix for nnet3 and chain dir -- relates to i-vector used @@ -121,7 +121,7 @@ graphdir=$chaindir/graph${unsup_decode_graph_affix} decode_affix=${decode_affix}${unsup_decode_graph_affix} -if [ ! -f $graphdir/HCLG.fst ]; then +if true || [ ! 
-f $graphdir/HCLG.fst ]; then utils/mkgraph.sh --self-loop-scale 1.0 $unsup_decode_lang $chaindir $graphdir fi @@ -164,7 +164,7 @@ for dset in $unsupervised_set; do steps/nnet3/decode_semisup.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ --acwt 1.0 --post-decode-acwt 10.0 --write-compact false --skip-scoring true \ --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_sp_hires \ - --frames-per-chunk 160 \ + --frames-per-chunk 160 --sub-split 1 \ --extra-left-context $extra_left_context \ --extra-right-context $extra_right_context \ --extra-left-context-initial 0 --extra-right-context-final 0 \ @@ -181,7 +181,7 @@ fi if [ $stage -le 8 ]; then steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ - data/${unsupervised_set}_sp_hires $lang \ + data/${unsupervised_set}_sp_hires \ $chaindir/decode_${unsupervised_set}_sp${decode_affix} \ $chaindir/best_path_${unsupervised_set}_sp${decode_affix} echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor @@ -211,7 +211,7 @@ dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${ if [ $stage -le 10 ]; then steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ - ${treedir} ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ + $treedir ${chaindir}/best_path_${unsupervised_set}_sp${decode_affix} \ $dir fi @@ -262,11 +262,11 @@ if [ $stage -le 11 ]; then # similar in the xent and regular final layers. output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - output name=output-0 input=output.affine@$label_delay skip-in-init=true - output name=output-1 input=output.affine@$label_delay skip-in-init=true + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay - output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true - output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ @@ -358,9 +358,9 @@ fi comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi if [ $stage -le 14 ]; then - steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ - --minibatch-size 64 --frames-per-iter 1500000 \ - --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir touch $comb_egs_dir/.nodelete # keep egs around when that run dies. 
fi diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_b.sh index d5f49eda8e5..ce8bf87c87d 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_b.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_250k_semisupervised_conf_smbr_b.sh @@ -197,13 +197,6 @@ diff $treedir/tree $chaindir/tree || { echo "$0: $treedir/tree and $chaindir/tre dir=$exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} -#if [ $stage -le 9 ]; then -# steps/subset_ali_dir.sh --cmd "$train_cmd" \ -# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ -# $chaindir/best_path_${unsupervised_set}_sp${decode_affix} \ -# $chaindir/best_path_${unsupervised_set}${decode_affix} -# echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor -#fi if [ $stage -le 10 ]; then steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ @@ -258,11 +251,11 @@ if [ $stage -le 11 ]; then # similar in the xent and regular final layers. output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - output name=output-0 input=output.affine@$label_delay skip-in-init=true - output name=output-1 input=output.affine@$label_delay skip-in-init=true + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay - output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true - output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ @@ -354,9 +347,9 @@ fi comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi if [ $stage -le 14 ]; then - steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ - --minibatch-size 64 --frames-per-iter 1500000 \ - --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir touch $comb_egs_dir/.nodelete # keep egs around when that run dies. fi @@ -440,77 +433,5 @@ if [ $stage -le 18 ]; then done fi -if ! 
$do_finetuning; then - wait - exit 0 -fi - -if [ $stage -le 19 ]; then - mkdir -p ${dir}${finetune_suffix} - - for f in phone_lm.fst normalization.fst den.fst tree 0.trans_mdl cmvn_opts; do - cp ${dir}/$f ${dir}${finetune_suffix} || exit 1 - done - cp -r ${dir}/configs ${dir}${finetune_suffix} || exit 1 - - nnet3-copy --edits="remove-output-nodes name=output;remove-output-nodes name=output-xent;rename-node old-name=output-0 new-name=output;rename-node old-name=output-0-xent new-name=output-xent" \ - $dir/${finetune_iter}.mdl ${dir}${finetune_suffix}/init.raw - - if [ $finetune_stage -le -1 ]; then - finetune_stage=-1 - fi - - steps/nnet3/chain/train.py --stage $finetune_stage \ - --trainer.input-model ${dir}${finetune_suffix}/init.raw \ - --egs.dir "$sup_egs_dir" \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" $finetune_opts \ - --chain.xent-regularize $finetune_xent_regularize \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights true \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --egs.chunk-left-context $chunk_left_context \ - --egs.chunk-right-context $chunk_right_context \ - --egs.chunk-left-context-initial 0 \ - --egs.chunk-right-context-final 0 \ - --trainer.num-chunk-per-minibatch "150=64/300=32" \ - --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs $num_epochs_finetune \ - --trainer.optimization.num-jobs-initial 3 \ - --trainer.optimization.num-jobs-final 16 \ - --trainer.optimization.initial-effective-lrate 0.0001 \ - --trainer.optimization.final-effective-lrate 0.00001 \ - --trainer.max-param-change 2.0 \ - --trainer.optimization.do-final-combination false \ - --cleanup.remove-egs false \ - --feat-dir data/${supervised_set}_hires \ - --tree-dir $treedir \ - --lat-dir $sup_lat_dir \ - --dir ${dir}${finetune_suffix} || exit 1; -fi - -dir=${dir}${finetune_suffix} - -if [ $stage -le 20 ]; then - for decode_set in dev test; do - ( - num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj $num_jobs --cmd "$decode_cmd" \ - --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ - --frames-per-chunk 150 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ - --extra-left-context-initial 0 --extra-right-context-final 0 \ - $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; - ) & - done -fi - wait; exit 0; - diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_ex500k_semisupervised_smbr_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_ex500k_semisupervised_smbr_a.sh index 1a4c6760f9a..ec5d2138730 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_ex500k_semisupervised_smbr_a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_ex500k_semisupervised_smbr_a.sh @@ -39,7 +39,7 @@ mmi_factor_schedule="output-0=1.0,1.0 output-1=0.2,0.2" smbr_factor_schedule="output-0=0.0,0.0 output-1=0.4,0.4" # Semi-supervised options -comb_affix=comb_500k_exp500k_1a_smbr # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +comb_affix=comb_500k_ex500k_1a_smbr # affix for new chain-model 
directory trained on the combined supervised+unsupervised subsets supervision_weights=1.0,1.0 chain_smbr_extra_opts="--one-silence-class" lm_weights=3,1 @@ -108,7 +108,7 @@ fi lang=data/lang_chain_unk unsup_decode_lang=data/lang_test_poco_ex500k_unk unsup_rescore_lang=data/lang_test_poco_ex500k_big_unk -unsup_decode_graph_affix=_poco_exp500k_unk +unsup_decode_graph_affix=_poco_ex500k_unk test_lang=data/lang_test_poco_unk test_graph_affix=_poco_unk @@ -145,7 +145,7 @@ for dset in $unsupervised_set; do steps/make_mfcc.sh --nj $decode_nj --cmd "$train_cmd" \ --mfcc-config conf/mfcc_hires.conf data/${dset}_sp_hires || exit 1 - steps/compute_cmvn_stats.shs data/${dset}_sp_hires + steps/compute_cmvn_stats.sh data/${dset}_sp_hires utils/fix_data_dir.sh data/${dset}_sp_hires fi @@ -370,7 +370,7 @@ comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi if [ $stage -le 14 ]; then steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ - --block-size 64 \ + --block-size 128 \ --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir touch $comb_egs_dir/.nodelete # keep egs around when that run dies. diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_d.sh index 1ac9c770b3b..910cac4f1d0 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_d.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_d.sh @@ -148,6 +148,8 @@ for dset in $unsupervised_set; do steps/make_mfcc.sh --nj $decode_nj --cmd "$train_cmd" \ --mfcc-config conf/mfcc_hires.conf data/${dset}_sp_hires || exit 1 + steps/compute_cmvn_stats.sh data/${dset}_sp_hires + utils/fix_data_dir.sh data/${dset}_sp_hires fi if [ $stage -le 4 ]; then @@ -161,14 +163,14 @@ for dset in $unsupervised_set; do if [ $stage -le 5 ]; then echo "$0: getting the decoding lattices for the unsupervised subset using the chain model at: $chaindir" - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + steps/nnet3/decode_semisup.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ --acwt 1.0 --post-decode-acwt 10.0 --write-compact false --skip-scoring true \ --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_sp_hires \ --frames-per-chunk 160 \ --extra-left-context $extra_left_context \ --extra-right-context $extra_right_context \ --extra-left-context-initial 0 --extra-right-context-final 0 \ - --scoring-opts "--min-lmwt 10 --max-lmwt 10" --determinize-opts "--word-determinize=false" \ + --scoring-opts "--min-lmwt 10 --max-lmwt 10" --word-determinize false \ $graphdir data/${dset}_sp_hires $chaindir/decode_${dset}_sp${decode_affix} fi ln -sf ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}/ || true @@ -262,11 +264,11 @@ if [ $stage -le 11 ]; then # similar in the xent and regular final layers. 
output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - output name=output-0 input=output.affine@$label_delay skip-in-init=true - output name=output-1 input=output.affine@$label_delay skip-in-init=true + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay - output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true - output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ @@ -358,9 +360,9 @@ fi comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi if [ $stage -le 14 ]; then - steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ - --minibatch-size 64 --frames-per-iter 1500000 \ - --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir touch $comb_egs_dir/.nodelete # keep egs around when that run dies. fi @@ -414,16 +416,7 @@ if [ $stage -le 17 ]; then fi if [ $stage -le 18 ]; then - iter_opts= - if [ ! -z $decode_iter ]; then - nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ - nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 - iter_opts=" --iter ${decode_iter}-output " - else - nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ - nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 - iter_opts=" --iter final-output " - fi + iter_opts=${decode_iter:+--iter $decode_iter} for decode_set in dev test; do ( diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_b.sh index 25ddcb186b4..281a6e4d88d 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_b.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_b.sh @@ -144,7 +144,7 @@ for dset in $unsupervised_set; do steps/make_mfcc.sh --nj $decode_nj --cmd "$train_cmd" \ --mfcc-config conf/mfcc_hires.conf data/${dset}_sp_hires || exit 1 - steps/compute_cmvn_stats.shs data/${dset}_sp_hires + steps/compute_cmvn_stats.sh data/${dset}_sp_hires utils/fix_data_dir.sh data/${dset}_sp_hires fi @@ -159,14 +159,14 @@ for dset in $unsupervised_set; do if [ $stage -le 5 ]; then echo "$0: getting the decoding lattices for the unsupervised subset using the chain model at: $chaindir" - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + steps/nnet3/decode_semisup.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ --acwt 1.0 --post-decode-acwt 10.0 --write-compact false --skip-scoring true \ --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_sp_hires \ --frames-per-chunk 160 \ --extra-left-context $extra_left_context \ 
--extra-right-context $extra_right_context \ --extra-left-context-initial 0 --extra-right-context-final 0 \ - --scoring-opts "--min-lmwt 10 --max-lmwt 10" --determinize-opts "--word-determinize=false" \ + --scoring-opts "--min-lmwt 10 --max-lmwt 10" --word-determinize false \ $graphdir data/${dset}_sp_hires $chaindir/decode_${dset}_sp${decode_affix} fi ln -sf ../final.mdl $chaindir/decode_${dset}_sp${decode_affix}/ || true @@ -260,11 +260,11 @@ if [ $stage -le 11 ]; then # similar in the xent and regular final layers. output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - output name=output-0 input=output.affine@$label_delay skip-in-init=true - output name=output-1 input=output.affine@$label_delay skip-in-init=true + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay - output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true - output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ @@ -356,9 +356,9 @@ fi comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi if [ $stage -le 14 ]; then - steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ - --minibatch-size 64 --frames-per-iter 1500000 \ - --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir touch $comb_egs_dir/.nodelete # keep egs around when that run dies. fi diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_c.sh index 66c89bb67d2..f29b65c6e7b 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_c.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_100k_500k_semisupervised_conf_smbr_c.sh @@ -179,7 +179,7 @@ fi if [ $stage -le 8 ]; then steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ - data/${unsupervised_set}_sp_hires $lang \ + data/${unsupervised_set}_sp_hires \ $chaindir/decode_${unsupervised_set}_sp${decode_affix} \ $chaindir/best_path_${unsupervised_set}_sp${decode_affix} echo $frame_subsampling_factor > $chaindir/best_path_${unsupervised_set}_sp${decode_affix}/frame_subsampling_factor diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_b.sh index e9749bd7676..681a46212c9 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_b.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_lstm_15k_semisupervised_conf_b.sh @@ -255,11 +255,11 @@ if [ $stage -le 11 ]; then # similar in the xent and regular final layers. 
output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - output name=output-0 input=output.affine@$label_delay skip-in-init=true - output name=output-1 input=output.affine@$label_delay skip-in-init=true + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay - output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true - output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ @@ -351,9 +351,9 @@ fi comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi if [ $stage -le 14 ]; then - steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ - --minibatch-size 64 --frames-per-iter 1500000 \ - --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights 2 \ $sup_egs_dir $unsup_egs_dir $comb_egs_dir touch $comb_egs_dir/.nodelete # keep egs around when that run dies. fi diff --git a/src/chainbin/nnet3-chain-compute-prob.cc b/src/chainbin/nnet3-chain-compute-prob.cc index 77a4e44f22d..01cce6d4165 100644 --- a/src/chainbin/nnet3-chain-compute-prob.cc +++ b/src/chainbin/nnet3-chain-compute-prob.cc @@ -86,7 +86,7 @@ int main(int argc, char *argv[]) { nnet_opts.compute_deriv = true; NnetChainComputeProb chain_prob_computer(nnet_opts, chain_opts, den_fst, - nnet); + nnet); SequentialNnetChainExampleReader example_reader(examples_rspecifier); From 9ab70b24f142c18691bb69b694af564b5e4a054b Mon Sep 17 00:00:00 2001 From: Cloud User Date: Thu, 7 Jun 2018 22:34:12 +0000 Subject: [PATCH 156/174] Adding semisup prep script for multi_en --- egs/multi_en/s5/local/make_mx6_calls.pl | 105 ++++ egs/multi_en/s5/local/make_mx6_mic.pl | 96 ++++ egs/multi_en/s5/local/make_sre.pl | 75 +++ egs/multi_en/s5/local/make_swbd2_phase1.pl | 106 ++++ egs/multi_en/s5/local/make_swbd2_phase2.pl | 107 ++++ egs/multi_en/s5/local/make_swbd2_phase3.pl | 102 ++++ egs/multi_en/s5/local/make_swbd_cellular1.pl | 83 +++ egs/multi_en/s5/local/make_swbd_cellular2.pl | 83 +++ .../chain/tuning/run_tdnn_semisupervised_a.sh | 473 ++++++++++++++++++ egs/multi_en/s5/local/semisup/run_mixer6.sh | 255 ++++++++++ egs/multi_en/s5/local/semisup/run_semisup.sh | 108 ++++ 11 files changed, 1593 insertions(+) create mode 100755 egs/multi_en/s5/local/make_mx6_calls.pl create mode 100755 egs/multi_en/s5/local/make_mx6_mic.pl create mode 100755 egs/multi_en/s5/local/make_sre.pl create mode 100755 egs/multi_en/s5/local/make_swbd2_phase1.pl create mode 100755 egs/multi_en/s5/local/make_swbd2_phase2.pl create mode 100755 egs/multi_en/s5/local/make_swbd2_phase3.pl create mode 100755 egs/multi_en/s5/local/make_swbd_cellular1.pl create mode 100755 egs/multi_en/s5/local/make_swbd_cellular2.pl create mode 100755 egs/multi_en/s5/local/semisup/chain/tuning/run_tdnn_semisupervised_a.sh create mode 100755 egs/multi_en/s5/local/semisup/run_mixer6.sh create mode 100644 egs/multi_en/s5/local/semisup/run_semisup.sh diff --git a/egs/multi_en/s5/local/make_mx6_calls.pl b/egs/multi_en/s5/local/make_mx6_calls.pl new file mode 100755 index 00000000000..ed9d6375248 --- /dev/null +++ 
b/egs/multi_en/s5/local/make_mx6_calls.pl
@@ -0,0 +1,105 @@
+#!/usr/bin/perl
+use warnings; #sed replacement for -w perl parameter
+# Copyright 2017  David Snyder
+# Apache 2.0
+#
+# Prepares the telephone portion of Mixer 6 (LDC2013S03).
+
+if (@ARGV != 2) {
+  print STDERR "Usage: $0 <path-to-LDC2013S03> <path-to-output>\n";
+  print STDERR "e.g. $0 /export/corpora5/LDC/LDC2013S03 data/\n";
+  exit(1);
+}
+($db_base, $out_dir) = @ARGV;
+
+if (! -d "$db_base/mx6_speech/data/ulaw_sphere/") {
+  print STDERR "Directory $db_base/mx6_speech/data/ulaw_sphere/ doesn't exist\n";
+  exit(1);
+}
+
+$out_dir = "$out_dir/mx6_calls";
+
+$tmp_dir = "$out_dir/tmp";
+if (system("mkdir -p $tmp_dir") != 0) {
+  die "Error making directory $tmp_dir";
+}
+
+if (system("mkdir -p $out_dir") != 0) {
+  print STDERR "Error making directory $out_dir\n";
+  exit(1);
+}
+
+%call2sph = ();
+open(SUBJECTS, "<$db_base/mx6_speech/docs/mx6_subjs.csv") || die "cannot open $db_base/mx6_speech/docs/mx6_subjs.csv";
+open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk";
+open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender";
+open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp";
+open(META, "<$db_base/mx6_speech/docs/mx6_calls.csv") || die "cannot open $db_base/mx6_speech/docs/mx6_calls.csv";
+
+if (system("find $db_base/mx6_speech/data/ulaw_sphere/ -name '*.sph' > $tmp_dir/sph.list") != 0) {
+  die "Error getting list of sph files";
+}
+open(SPHLIST, "<$tmp_dir/sph.list") or die "cannot open wav list";
+
+while(<SPHLIST>) {
+  chomp;
+  $sph = $_;
+  @toks = split("/",$sph);
+  $sph_id = (split("[./]",$toks[$#toks]))[0];
+  $call_id = (split("_", $sph_id))[2];
+  $call2sph[$call_id] = $sph;
+}
+
+while (<SUBJECTS>) {
+  chomp;
+  $line = $_;
+  @toks = split(",", $line);
+  $spk = $toks[0];
+  $gender = lc $toks[1];
+  if ($gender eq "f" or $gender eq "m") {
+    print GNDR "$spk $gender\n";
+  }
+}
+
+$num_good_files = 0;
+$num_bad_files = 0;
+while (<META>) {
+  chomp;
+  $line = $_;
+  @toks = split(",", $line);
+  $call_id = $toks[0];
+  ($call_date, $call_time) = split(/_/, $toks[1]);
+  $sid_A = $toks[4];
+  $sid_B = $toks[12];
+  if (-f $call2sph[$call_id]) {
+    $utt_A = "${sid_A}_MX6_${call_id}_A";
+    $utt_B = "${sid_B}_MX6_${call_id}_B";
+    print SPKR "${utt_A} $sid_A\n";
+    print SPKR "${utt_B} $sid_B\n";
+    print WAV "${utt_A} sph2pipe -f wav -p -c 1 $call2sph[$call_id] |\n";
+    print WAV "${utt_B} sph2pipe -f wav -p -c 2 $call2sph[$call_id] |\n";
+    $num_good_files++;
+  } else {
+    print STDERR "Sphere file for $call_id doesn't exist\n";
+    $num_bad_files++;
+  }
+}
+
+print STDERR "Processed $num_good_files utterances; $num_bad_files had missing sphere data.\n";
+
+close(SPHLIST) || die;
+close(SUBJECTS) || die;
+close(GNDR) || die;
+close(SPKR) || die;
+close(WAV) || die;
+close(META) || die;
+
+if (system(
+  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_dir";
+}
+
+system("utils/fix_data_dir.sh $out_dir");
+if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
+  die "Error validating directory $out_dir";
+}
diff --git a/egs/multi_en/s5/local/make_mx6_mic.pl b/egs/multi_en/s5/local/make_mx6_mic.pl
new file mode 100755
index 00000000000..f021140f235
--- /dev/null
+++ b/egs/multi_en/s5/local/make_mx6_mic.pl
@@ -0,0 +1,96 @@
+#!/usr/bin/perl
+use warnings; #sed replacement for -w perl parameter
+# Copyright 2017  David Snyder
+# Apache 2.0
+# Prepares Mixer 6 (LDC2013S03) speech from a specified
microphone and +# downsamples it to 8k. + +if (@ARGV != 3) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/corpora5/LDC/LDC2013S03 02 data/\n"; + exit(1); +} +($db_base, $ch, $out_dir) = @ARGV; + +@bad_channels = ("01", "03", "14"); +if (/$ch/i ~~ @bad_channels) { + print STDERR "Bad channel $ch\n"; + exit(1); +} + +if (! -d "$db_base/mx6_speech/data/pcm_flac/CH$ch/") { + print STDERR "Directory $db_base/mx6_speech/data/pcm_flac/CH$ch/ doesn't exist\n"; + exit(1); +} + +$out_dir = "$out_dir/mx6_mic_$ch"; +if (system("mkdir -p $out_dir")) { + print STDERR "Error making directory $out_dir\n"; + exit(1); +} + +if (system("mkdir -p $out_dir") != 0) { + print STDERR "Error making directory $out_dir\n"; + exit(1); +} + +open(SUBJECTS, "<$db_base/mx6_speech/docs/mx6_subjs.csv") || die "cannot open $$db_base/mx6_speech/docs/mx6_subjs.csv"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; +open(META, "<$db_base/mx6_speech/docs/mx6_ivcomponents.csv") || die "cannot open $db_base/mx6_speech/docs/mx6_ivcomponents.csv"; + +while () { + chomp; + $line = $_; + @toks = split(",", $line); + $spk = $toks[0]; + $gender = lc $toks[1]; + if ($gender eq "f" or $gender eq "m") { + print GNDR "$spk $gender\n"; + } +} + +$num_good_files = 0; +$num_bad_files = 0; +while () { + chomp; + $line = $_; + @toks = split(",", $line); + $flac = "$db_base/mx6_speech/data/pcm_flac/CH$ch/$toks[0]_CH$ch.flac"; + $t1 = $toks[7]; + $t2 = $toks[8]; + @toks2 = split(/_/, $toks[0]); + if (-f $flac) { + if ($t2 - $t1 < 0.01) { # recordings with errors have 0 as the time stamps + $num_bad_files++; + next; + } + $spk = $toks2[3]; + $utt = "${spk}_MX6_$toks2[0]_$toks2[1]_$ch"; + print SPKR "${utt} $spk\n"; + print WAV "${utt} sox -t flac $flac -r 8k -t wav - trim $t1 =$t2 |\n"; + $num_good_files++; + } else { + print STDERR "File $flac doesn't exist\n"; + $num_bad_files++; + } +} + +print STDERR "Processed $num_good_files utterances; $num_bad_files had missing flac data.\n"; + +close(SUBJECTS) || die; +close(GNDR) || die; +close(SPKR) || die; +close(WAV) || die; +close(META) || die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} + +system("utils/fix_data_dir.sh $out_dir"); +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/multi_en/s5/local/make_sre.pl b/egs/multi_en/s5/local/make_sre.pl new file mode 100755 index 00000000000..d6e1abf94b0 --- /dev/null +++ b/egs/multi_en/s5/local/make_sre.pl @@ -0,0 +1,75 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2015 David Snyder +# Apache 2.0. +# Usage: make_sre.pl + +if (@ARGV != 4) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/corpora5/LDC/LDC2006S44 sre2004 sre_ref data/sre2004\n"; + exit(1); +} + +($db_base, $sre_year, $sre_ref_filename, $out_dir) = @ARGV; +%utt2sph = (); +%spk2gender = (); + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("find -L $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; + +while() { + chomp; + $sph = $_; + @A1 = split("/",$sph); + @A2 = split("[./]",$A1[$#A1]); + $uttId=$A2[0]; + $utt2sph{$uttId} = $sph; +} + +open(GNDR,">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; +open(SPKR,">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV,">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(SRE_REF, "<$sre_ref_filename") or die "Cannot open SRE reference."; +while () { + chomp; + ($speaker, $gender, $other_sre_year, $utt_id, $channel) = split(" ", $_); + $channel_num = "1"; + if ($channel eq "A") { + $channel_num = "1"; + } else { + $channel_num = "2"; + } + $channel = lc $channel; + if (($other_sre_year eq "sre20$sre_year") and (exists $utt2sph{$utt_id})) { + $full_utt_id = "$speaker-sre$sre_year-$utt_id-$channel"; + $spk2gender{"$speaker"} = $gender; + print WAV "$full_utt_id"," sph2pipe -f wav -p -c $channel_num $utt2sph{$utt_id} |\n"; + print SPKR "$full_utt_id $speaker","\n"; + } +} +foreach $speaker (keys %spk2gender) { + print GNDR "$speaker $spk2gender{$speaker}\n"; +} + +close(GNDR) || die; +close(SPKR) || die; +close(WAV) || die; +close(SRE_REF) || die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} + +system("utils/fix_data_dir.sh $out_dir"); +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/multi_en/s5/local/make_swbd2_phase1.pl b/egs/multi_en/s5/local/make_swbd2_phase1.pl new file mode 100755 index 00000000000..71b26b55de5 --- /dev/null +++ b/egs/multi_en/s5/local/make_swbd2_phase1.pl @@ -0,0 +1,106 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2017 David Snyder +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/corpora3/LDC/LDC98S75 data/swbd2_phase1_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/doc/callstat.tbl") || die "Could not open $db_base/doc/callstat.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio = ("3", "4"); + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} + +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; + +%wavs = (); +while() { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $uttId = $t1[0]; + $wavs{$uttId} = $sph; +} + +while () { + $line = $_ ; + @A = split(",", $line); + @A1 = split("[./]",$A[0]); + $wav = $A1[0]; + if (/$wav/i ~~ @badAudio) { + # do nothing + print "Bad Audio = $wav"; + } else { + $spkr1= "sw_" . $A[2]; + $spkr2= "sw_" . $A[3]; + $gender1 = $A[5]; + $gender2 = $A[6]; + if ($gender1 eq "M") { + $gender1 = "m"; + } elsif ($gender1 eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($gender2 eq "M") { + $gender2 = "m"; + } elsif ($gender2 eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$wavs{$wav}") { + $uttId = $spkr1 ."_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wavs{$wav} |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "_" . $wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $wavs{$wav} |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $wavs{$wav} for $wav\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/multi_en/s5/local/make_swbd2_phase2.pl b/egs/multi_en/s5/local/make_swbd2_phase2.pl new file mode 100755 index 00000000000..337ab9d9708 --- /dev/null +++ b/egs/multi_en/s5/local/make_swbd2_phase2.pl @@ -0,0 +1,107 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2013 Daniel Povey +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/corpora5/LDC/LDC99S79 data/swbd2_phase2_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/DISC1/doc/callstat.tbl") || die "Could not open $db_base/DISC1/doc/callstat.tbl"; +open(CI, "<$db_base/DISC1/doc/callinfo.tbl") || die "Could not open $db_base/DISC1/doc/callinfo.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio = ("3", "4"); + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} + +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; + +while() { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $uttId=$t1[0]; + $wav{$uttId} = $sph; +} + +while () { + $line = $_ ; + $ci = ; + $ci = ; + @ci = split(",",$ci); + $wav = $ci[0]; + @A = split(",", $line); + if (/$wav/i ~~ @badAudio) { + # do nothing + } else { + $spkr1= "sw_" . $A[2]; + $spkr2= "sw_" . $A[3]; + $gender1 = $A[4]; + $gender2 = $A[5]; + if ($gender1 eq "M") { + $gender1 = "m"; + } elsif ($gender1 eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($gender2 eq "M") { + $gender2 = "m"; + } elsif ($gender2 eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$wav{$wav}") { + $uttId = $spkr1 ."_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wav{$wav} |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "_" . $wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $wav{$wav} |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $wav{$wav} for $wav\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/multi_en/s5/local/make_swbd2_phase3.pl b/egs/multi_en/s5/local/make_swbd2_phase3.pl new file mode 100755 index 00000000000..f27853415a0 --- /dev/null +++ b/egs/multi_en/s5/local/make_swbd2_phase3.pl @@ -0,0 +1,102 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2013 Daniel Povey +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/corpora5/LDC/LDC2002S06 data/swbd2_phase3_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/DISC1/docs/callstat.tbl") || die "Could not open $db_base/DISC1/docs/callstat.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio = ("3", "4"); + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} + +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; +while() { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $uttId=$t1[0]; + $wav{$uttId} = $sph; +} + +while () { + $line = $_ ; + @A = split(",", $line); + $wav = "sw_" . $A[0] ; + if (/$wav/i ~~ @badAudio) { + # do nothing + } else { + $spkr1= "sw_" . $A[3]; + $spkr2= "sw_" . $A[4]; + $gender1 = $A[5]; + $gender2 = $A[6]; + if ($gender1 eq "M") { + $gender1 = "m"; + } elsif ($gender1 eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($gender2 eq "M") { + $gender2 = "m"; + } elsif ($gender2 eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$wav{$wav}") { + $uttId = $spkr1 ."_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wav{$wav} |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "_" . $wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $wav{$wav} |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $wav{$wav} for $wav\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/multi_en/s5/local/make_swbd_cellular1.pl b/egs/multi_en/s5/local/make_swbd_cellular1.pl new file mode 100755 index 00000000000..e30c710e6fa --- /dev/null +++ b/egs/multi_en/s5/local/make_swbd_cellular1.pl @@ -0,0 +1,83 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2013 Daniel Povey +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/corpora5/LDC/LDC2001S13 data/swbd_cellular1_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/doc/swb_callstats.tbl") || die "Could not open $db_base/doc/swb_callstats.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio = ("40019", "45024", "40022"); + +while () { + $line = $_ ; + @A = split(",", $line); + if (/$A[0]/i ~~ @badAudio) { + # do nothing + } else { + $wav = "sw_" . $A[0]; + $spkr1= "sw_" . $A[1]; + $spkr2= "sw_" . $A[2]; + $gender1 = $A[3]; + $gender2 = $A[4]; + if ($A[3] eq "M") { + $gender1 = "m"; + } elsif ($A[3] eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($A[4] eq "M") { + $gender2 = "m"; + } elsif ($A[4] eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$db_base/$wav.sph") { + $uttId = $spkr1 . "-swbdc_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $db_base/$wav.sph |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "-swbdc_" . $wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $db_base/$wav.sph |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $db_base/$wav.sph\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/multi_en/s5/local/make_swbd_cellular2.pl b/egs/multi_en/s5/local/make_swbd_cellular2.pl new file mode 100755 index 00000000000..4de954c194c --- /dev/null +++ b/egs/multi_en/s5/local/make_swbd_cellular2.pl @@ -0,0 +1,83 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2013 Daniel Povey +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/corpora5/LDC/LDC2004S07 data/swbd_cellular2_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/docs/swb_callstats.tbl") || die "Could not open $db_base/docs/swb_callstats.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio=("45024", "40022"); + +while () { + $line = $_ ; + @A = split(",", $line); + if (/$A[0]/i ~~ @badAudio) { + # do nothing + } else { + $wav = "sw_" . $A[0]; + $spkr1= "sw_" . $A[1]; + $spkr2= "sw_" . 
$A[2]; + $gender1 = $A[3]; + $gender2 = $A[4]; + if ($A[3] eq "M") { + $gender1 = "m"; + } elsif ($A[3] eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($A[4] eq "M") { + $gender2 = "m"; + } elsif ($A[4] eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$db_base/data/$wav.sph") { + $uttId = $spkr1 . "-swbdc_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $db_base/data/$wav.sph |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "-swbdc_" . $wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $db_base/data/$wav.sph |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $db_base/data/$wav.sph\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/multi_en/s5/local/semisup/chain/tuning/run_tdnn_semisupervised_a.sh b/egs/multi_en/s5/local/semisup/chain/tuning/run_tdnn_semisupervised_a.sh new file mode 100755 index 00000000000..947465dd9a1 --- /dev/null +++ b/egs/multi_en/s5/local/semisup/chain/tuning/run_tdnn_semisupervised_a.sh @@ -0,0 +1,473 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_250k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=400 +test_nj=50 + +# The following 3 options decide the output directory for semi-supervised +# chain system +# dir=${exp_root}/chain${chain_affix}/tdnn${tdnn_affix} +multi=multi_a +chain_affix= +tdnn_affix=_semisup_5b + +# Data directories +supervised_data=data/multi_a/tri5a +unsupervised_data=data/train_mixer6_1a_seg + +# Input seed system +sup_gmm=tri5a +sup_chain_dir=exp/multi_a/chain/tdnn_5b_sp +sup_lat_dir=exp/multi_a/tri5a_lats_nodup_sp +sup_tree_dir=exp/multi_a/chain/tri5a_tree +sup_ivector_dir=exp/multi_a/nnet3/ivectors_multi_a/tri5a_sp +sup_ivector_root_dir=exp/multi_a/nnet3 + +train_new_ivector=false +nnet3_affix= # affix for nnet3 -- relates to i-vector used + # Applicable if training a new i-vector extractor + +# Unsupervised options +unsup_decode_opts="--frames-per-chunk 160 --extra-left-context 0 --extra-right-context 0" +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +# Semi-supervised options +supervision_weights=1.0,1.0 +lm_weights=3,1 +num_copies= +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1536 +hidden_dim_l=1792 +bottleneck_dim=320 + +apply_deriv_weights=true +use_smart_splitting=true + +# 
training options +num_epochs=2 +initial_effective_lrate=0.0005 +final_effective_lrate=0.00005 +max_param_change=2.0 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +decode_iter= +decode_dir_affix= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +exp_root=exp/$multi + +RANDOM=0 + +#if ! cuda-compiled; then +# cat </dev/null || true +utils/fix_data_dir.sh ${unsupervised_data}_sp_hires || exit 1 + +unsupervised_set=$(basename $unsupervised_data) +if [ $stage -le 2 ]; then + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 \ + ${unsupervised_data}_sp_hires ${unsupervised_data}_sp_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ + ${unsupervised_data}_sp_max2_hires $sup_ivector_root_dir/extractor \ + $sup_ivector_root_dir/ivectors_${unsupervised_set}_sp_hires || exit 1 +fi + +split_nj=400 +if [ $stage -le 5 ]; then + echo "$0: getting the decoding lattices for the unsupervised subset using the chain model at: $sup_chain_dir" + steps/nnet3/decode_semisup.sh --num-threads 4 --sub-split $nj --nj $split_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 --write-compact false --skip-scoring true \ + --online-ivector-dir $sup_ivector_root_dir/ivectors_${unsupervised_set}_sp_hires \ + $unsup_decode_opts --keep-subsplit true \ + --scoring-opts "--min-lmwt 10 --max-lmwt 10" --word-determinize false \ + $graphdir ${unsupervised_data}_sp_hires $sup_chain_dir/decode${decode_affix}_${unsupervised_set}_sp || exit 1 +fi + +if [ $stage -le 6 ]; then + steps/lmrescore_const_arpa_undeterminized.sh --cmd "$decode_cmd" \ + --scoring-opts "--min-lmwt 10 --max-lmwt 10" --skip-scoring true \ + --write-compact true --acwt 0.1 --beam 8.0 --keep-subsplit true \ + $unsup_decode_lang $unsup_rescore_lang \ + ${unsupervised_data}_sp_hires \ + $sup_chain_dir/decode${decode_affix}_${unsupervised_set}_sp \ + $sup_chain_dir/decode${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix} || exit 1 +fi + +ln -sf ../final.mdl $sup_chain_dir/decode${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix}/ || true + +frame_subsampling_factor=1 +if [ -f $sup_chain_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $sup_chain_dir/frame_subsampling_factor` +fi + +if [ $stage -le 7 ]; then + steps/nnet3/merge_subsplit_lattices.sh \ + --cmd "${train_cmd}" --skip-scoring true --skip-diagnostics true \ + $unsup_decode_lang \ + ${unsupervised_data}_sp_hires \ + $sup_chain_dir/decode${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix} || exit 1 +fi + +unsup_lat_dir=${sup_chain_dir}/decode${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix} + + +if [ $stage -le 8 ]; then + steps/best_path_weights.sh --cmd "${train_cmd}" --acwt 0.1 \ + ${unsupervised_data}_sp_hires \ + $sup_chain_dir/decode${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix} \ + $sup_chain_dir/best_path${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix} || exit 1 +fi +echo $frame_subsampling_factor > $sup_chain_dir/best_path${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix}/frame_subsampling_factor + +cmvn_opts=`cat $sup_chain_dir/cmvn_opts` || exit 1 + +if [ ! -f $sup_tree_dir/final.mdl ]; then + echo "$0: $sup_tree_dir/final.mdl does not exist." 
+ exit 1 +fi + +diff $sup_tree_dir/tree $sup_chain_dir/tree || { echo "$0: $sup_tree_dir/tree and $sup_chain_dir/tree differ"; exit 1; } + +dir=$exp_root/chain${chain_affix}/tdnn${tdnn_affix} + +#if [ $stage -le 9 ]; then +# steps/subset_ali_dir.sh --cmd "$train_cmd" \ +# data/${unsupervised_set} data/${unsupervised_set}_sp_hires \ +# $sup_chain_dir/best_path_${unsupervised_set}_sp${decode_affix} \ +# $sup_chain_dir/best_path_${unsupervised_set}${decode_affix} +# echo $frame_subsampling_factor > $sup_chain_dir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor +#fi + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${sup_tree_dir} ${sup_chain_dir}/best_path${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.0015 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="l2-regularize=0.0015 orthonormal-constraint=-1.0" + output_opts="l2-regularize=0.001" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=$hidden_dim + linear-component name=tdnn2l0 dim=$bottleneck_dim $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=$bottleneck_dim $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=$hidden_dim + linear-component name=tdnn3l dim=$bottleneck_dim $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=$hidden_dim input=Append(0,1) + linear-component name=tdnn4l0 dim=$bottleneck_dim $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=$bottleneck_dim $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=$hidden_dim + linear-component name=tdnn5l dim=$bottleneck_dim $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=$hidden_dim input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=$bottleneck_dim $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=$bottleneck_dim $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=$hidden_dim_l + linear-component name=tdnn7l0 dim=$bottleneck_dim $linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=$bottleneck_dim $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=$hidden_dim + linear-component name=tdnn8l0 dim=$bottleneck_dim $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=$bottleneck_dim $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=$hidden_dim_l + linear-component name=tdnn9l0 dim=$bottleneck_dim $linear_opts input=Append(-3,0) + 
linear-component name=tdnn9l dim=$bottleneck_dim $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=$hidden_dim + linear-component name=tdnn10l0 dim=$bottleneck_dim $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=$bottleneck_dim $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=$hidden_dim_l + linear-component name=tdnn11l0 dim=$bottleneck_dim $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=$bottleneck_dim $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=$hidden_dim + linear-component name=prefinal-l dim=$bottleneck_dim $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=$hidden_dim_l + linear-component name=prefinal-chain-l dim=$bottleneck_dim $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=$hidden_dim_l + linear-component name=prefinal-xent-l dim=$bottleneck_dim $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + + output name=output-0 input=output.affine + output name=output-1 input=output.affine + + output name=output-0-xent input=output-xent.log-softmax + output name=output-1-xent input=output-xent.log-softmax +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +egs_left_context=`perl -e "print int($model_left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($model_right_context + $frame_subsampling_factor / 2)"` + +supervised_set=$(basename $supervised_data) +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set}_sp + frames_per_eg=$(cat $sup_chain_dir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $sup_ivector_dir \ + --generate-egs-scp true --constrained false \ + ${supervised_data}_sp_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set}_sp + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $sup_chain_dir/best_path${decode_affix}_${unsupervised_set}_sp${unsup_rescore_graph_affix}/weights.scp \ + --online-ivector-dir $sup_ivector_root_dir/ivectors_${unsupervised_set}_sp_hires \ + --generate-egs-scp true --constrained false $unsup_egs_opts \ + ${unsupervised_data}_sp_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs + +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $sup_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$comb_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs false \ + --feat-dir ${supervised_data}_sp_hires \ + --tree-dir $sup_tree_dir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + + for decode_set in rt03 eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $sup_ivector_root_dir/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if [ ! -z "$test_rescore_lang" ]; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + $test_lang $test_rescore_lang data/${decode_set}_hires \ + $dir/decode_${decode_set}${test_graph_affix} \ + $dir/decode_${decode_set}${test_rescore_graph_affix} || exit 1; + fi + ) & + done +fi + +test_online_decoding=true +lang=data/lang_${multi}_${gmm}_fsh_sw1_tg +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang $sup_ivector_root_dir/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in rt03 eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${test_graph_affix} || exit 1; + if $rescore; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + $test_lang $test_rescore_lang data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${test_graph_affix} \ + ${dir}_online/decode_${decode_set}${test_rescore_graph_affix} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/multi_en/s5/local/semisup/run_mixer6.sh b/egs/multi_en/s5/local/semisup/run_mixer6.sh new file mode 100755 index 00000000000..6bdb74eb9bb --- /dev/null +++ b/egs/multi_en/s5/local/semisup/run_mixer6.sh @@ -0,0 +1,255 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +# This script demonstrates semi-supervised training using 50 hours of +# supervised data and 250 hours of unsupervised data. +# We assume the supervised data is in data/train_sup and unsupervised data +# is in data/train_unsup100k_250k. +# For LM training, we assume there is data/train/text, from which +# we will exclude the utterances contained in the unsupervised set. +# We use all 300 hours of semi-supervised data for i-vector extractor training. + +# This differs from run_100k.sh, which uses only 100 hours supervised data for +# both i-vector extractor training and LM training. + +. ./cmd.sh +. ./path.sh + +set -o pipefail +exp_root=exp/multi_a +supervised_data=data/multi_a/tri5b +stage=0 + +. 
utils/parse_options.sh + +if [ $stage -le 0 ]; then + local/mixer6_calls_prepare_data.py /export/LDC/LDC2013S03/mx6_speech data/local/mixer6 + local/mixer6_format_data.sh data/local/mixer6 data/train_mixer6 +fi + +mkdir -p sad_model + +if [ $stage -le 1 ]; then + ( + cd sad_model + wget http://kaldi-asr.org/models/0004_tdnn_stats_asr_sad_1a.tar.gz + tar -xzvf 0004_tdnn_stats_asr_sad_1a.tar.gz + ) +fi + +if [ $stage -le 2 ]; then + steps/segmentation/detect_speech_activity.sh --stage $sad_stage \ + --cmd "$train_cmd" --nj 400 --convert-data-dir-to-whole false \ + --extra-left-context 79 --extra-right-context 21 \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 150 --mfcc-config sad_model/conf/mfcc_hires.conf \ + data/train_mixer6 sad_model/exp/segmentation_1a/tdnn_stats_asr_sad_1a \ + sad_model/mfcc_hires sad_model/exp/segmentation_1a/tdnn_stats_asr_sad_1a \ + data/train_mixer6_1a +fi + +for f in data/train_mixer6_1a_seg/utt2spk \ + data/train_mixer6_1a_seg/feats.scp; do + if [ ! -f $f ]; then + echo "$0: Could not find $f" + exit 1 + fi +done + +############################################################################### +# Prepare the 50 hours supervised set and subsets for initial GMM training +############################################################################### + +if [ $stage -le 0 ]; then + utils/subset_data_dir.sh --speakers data/train_sup 50000 data/train_sup50k || exit 1 + utils/subset_data_dir.sh --shortest data/train_sup50k 25000 data/train_sup50k_short || exit 1 + utils/subset_data_dir.sh --speakers data/train_sup50k 30000 data/train_sup50k_30k || exit 1; +fi + +############################################################################### +# GMM system training using 50 hours supervised data +############################################################################### + +if [ $stage -le 1 ]; then + steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup50k_short data/lang $exp_root/mono0a || exit 1 +fi + +if [ $stage -le 2 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k_30k data/lang $exp_root/mono0a $exp_root/mono0a_ali || exit 1 + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_sup50k_30k data/lang $exp_root/mono0a_ali $exp_root/tri1 || exit 1 + + (utils/mkgraph.sh data/lang_test $exp_root/tri1 $exp_root/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri1/graph data/dev $exp_root/tri1/decode_dev)& +fi + +if [ $stage -le 3 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k_30k data/lang $exp_root/tri1 $exp_root/tri1_ali || exit 1; + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_sup50k_30k data/lang $exp_root/tri1_ali $exp_root/tri2 || exit 1 + + (utils/mkgraph.sh data/lang_test $exp_root/tri2 $exp_root/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri2/graph data/dev $exp_root/tri2/decode_dev)& +fi + +if [ $stage -le 4 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k data/lang $exp_root/tri2 $exp_root/tri2_ali || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 30000 data/train_sup50k data/lang $exp_root/tri2_ali $exp_root/tri3a || exit 1; + + (utils/mkgraph.sh data/lang_test $exp_root/tri3a $exp_root/tri3a/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri3a/graph data/dev $exp_root/tri3a/decode_dev)& +fi + +if [ $stage -le 
5 ]; then + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k data/lang $exp_root/tri3a $exp_root/tri3a_ali || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" \ + 4000 50000 data/train_sup50k data/lang $exp_root/tri3a_ali $exp_root/tri4a || exit 1; + + ( + utils/mkgraph.sh data/lang_test $exp_root/tri4a $exp_root/tri4a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri4a/graph data/dev $exp_root/tri4a/decode_dev + )& +fi + +############################################################################### +# Prepare semi-supervised train set +############################################################################### + +if [ $stage -le 6 ]; then + utils/combine_data.sh data/semisup50k_100k_250k \ + data/train_sup50k data/train_unsup100k_250k || exit 1 +fi + +############################################################################### +# Train LM on all the text in data/train/text, but excluding the +# utterances in the unsupervised set +############################################################################### + +if [ $stage -le 7 ]; then + mkdir -p data/local/pocolm_ex250k + + utils/filter_scp.pl --exclude data/train_unsup100k_250k/utt2spk \ + data/train/text > data/local/pocolm_ex250k/text.tmp + + if [ ! -f data/lang_test_poco_ex250k_big/G.carpa ]; then + local/fisher_train_lms_pocolm.sh \ + --text data/local/pocolm_ex250k/text.tmp \ + --dir data/local/pocolm_ex250k + + local/fisher_create_test_lang.sh \ + --arpa-lm data/local/pocolm_ex250k/data/arpa/4gram_small.arpa.gz \ + --dir data/lang_test_poco_ex250k + + utils/build_const_arpa_lm.sh \ + data/local/pocolm_ex250k/data/arpa/4gram_big.arpa.gz \ + data/lang_test_poco_ex250k data/lang_test_poco_ex250k_big + fi +fi + +############################################################################### +# Prepare lang directories with UNK modeled using phone LM +############################################################################### + +if [ $stage -le 8 ]; then + local/run_unk_model.sh || exit 1 + + for lang_dir in data/lang_test_poco_ex250k; do + rm -r ${lang_dir}_unk ${lang_dir}_unk_big 2>/dev/null || true + cp -rT data/lang_unk ${lang_dir}_unk + cp ${lang_dir}/G.fst ${lang_dir}_unk/G.fst + cp -rT data/lang_unk ${lang_dir}_unk_big + cp ${lang_dir}_big/G.carpa ${lang_dir}_unk_big/G.carpa; + done +fi + +############################################################################### +# Train seed chain system using 50 hours supervised data. +# Here we train i-vector extractor on combined supervised and unsupervised data +############################################################################### + +if [ $stage -le 9 ]; then + local/semisup/chain/run_tdnn.sh \ + --train-set train_sup50k \ + --ivector-train-set semisup50k_100k_250k \ + --nnet3-affix _semi50k_100k_250k \ + --chain-affix _semi50k_100k_250k \ + --tdnn-affix _1a --tree-affix bi_a \ + --gmm tri4a --exp-root $exp_root || exit 1 + + # WER on dev 21.41 + # WER on test 21.03 + # Final train prob -0.1035 + # Final valid prob -0.1667 + # Final train prob (xent) -1.5926 + # Final valid prob (xent) -1.7990 +fi + +############################################################################### +# Semi-supervised training using 50 hours supervised data and +# 250 hours unsupervised data. We use i-vector extractor, tree, lattices +# and seed chain system from the previous stage. 
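+# (The called script is expected to follow the same recipe as
+# run_tdnn_semisupervised_a.sh above: numerator supervision for the
+# unsupervised egs comes from the decoded lattices scaled by
+# lattice_lm_scale, per-frame derivative weights come from the best-path
+# pdf posteriors, and the supervised and unsupervised egs are then
+# combined for training.)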
+############################################################################### + +if [ $stage -le 10 ]; then + local/semisup/chain/run_tdnn_50k_semisupervised.sh \ + --supervised-set train_sup50k \ + --unsupervised-set train_unsup100k_250k \ + --sup-chain-dir $exp_root/chain_semi50k_100k_250k/tdnn_1a_sp \ + --sup-lat-dir $exp_root/chain_semi50k_100k_250k/tri4a_train_sup50k_sp_unk_lats \ + --sup-tree-dir $exp_root/chain_semi50k_100k_250k/tree_bi_a \ + --ivector-root-dir $exp_root/nnet3_semi50k_100k_250k \ + --chain-affix _semi50k_100k_250k \ + --tdnn-affix _semisup_1a \ + --exp-root $exp_root || exit 1 + + # WER on dev 18.98 + # WER on test 18.85 + # Final output-0 train prob -0.1381 + # Final output-0 valid prob -0.1723 + # Final output-0 train prob (xent) -1.3676 + # Final output-0 valid prob (xent) -1.4589 + # Final output-1 train prob -0.7671 + # Final output-1 valid prob -0.7714 + # Final output-1 train prob (xent) -1.1480 + # Final output-1 valid prob (xent) -1.2382 +fi + +############################################################################### +# Oracle system trained on combined 300 hours including both supervised and +# unsupervised sets. We use i-vector extractor, tree, and GMM trained +# on only the supervised for fair comparison to semi-supervised experiments. +############################################################################### + +if [ $stage -le 11 ]; then + local/semisup/chain/run_tdnn.sh \ + --train-set semisup50k_100k_250k \ + --nnet3-affix _semi50k_100k_250k \ + --chain-affix _semi50k_100k_250k \ + --common-treedir $exp_root/chain_semi50k_100k_250k/tree_bi_a \ + --tdnn-affix 1a_oracle --nj 100 \ + --gmm tri4a --exp-root $exp_root \ + --stage 9 || exit 1 + + # WER on dev 17.55 + # WER on test 17.72 + # Final output train prob -0.1155 + # Final output valid prob -0.1510 + # Final output train prob (xent) -1.7458 + # Final output valid prob (xent) -1.9045 +fi diff --git a/egs/multi_en/s5/local/semisup/run_semisup.sh b/egs/multi_en/s5/local/semisup/run_semisup.sh new file mode 100644 index 00000000000..1b1ac29da62 --- /dev/null +++ b/egs/multi_en/s5/local/semisup/run_semisup.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +stage=0 +nj=800 +sad_stage=5 + +. utils/parse_options.sh + +. ./path.sh +. ./cmd.sh + +if [ $stage -le 0 ]; then + # Prepare SWBD corpora. 
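+  # Each make_swbd*.pl call below writes wav.scp, utt2spk and spk2gender
+  # for one corpus, using the speaker/channel/gender info in the LDC call
+  # tables.  As a rough sketch (the utterance id and path here are made
+  # up), every wav.scp entry is a sph2pipe pipe extracting one channel of
+  # a .sph file:
+  #   sw_12345_sw_20001_1 sph2pipe -f wav -p -c 1 /path/to/sw_20001.sph |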
+ local/make_swbd2_phase1.pl /export/LDC/LDC98S75 \ + data/swbd2_phase1_train + local/make_swbd2_phase2.pl /export/LDC/LDC99S79 \ + data/swbd2_phase2_train + local/make_swbd2_phase3.pl /export/LDC/LDC2002S06 \ + data/swbd2_phase3_train + local/make_swbd_cellular1.pl /export/LDC/LDC2001S13 \ + data/swbd_cellular1_train + local/make_swbd_cellular2.pl /export/LDC/LDC2004S07 \ + data/swbd_cellular2_train +fi + +if [ $stage -le 1 ]; then + mkdir -p data/local/sre + wget -P data/local/sre http://www.openslr.org/resources/15/speaker_list.tgz + tar -C data/local/sre -xvf data/local/sre/speaker_list.tgz + sre_ref=data/local/sre/speaker_list + + local/make_sre.pl /export/LDC/LDC2006S44 04 \ + data/local/sre/speaker_list data/sre2004 +fi + +if [ $stage -le 2 ]; then + local/make_mx6_calls.pl /export/LDC/LDC2013S03 data/local/mx6 + + for mic in 02 04 05 06 07 08 09 10 11 12 13; do + local/make_mx6_mic.pl /export/LDC/LDC2013S03 $mic data/local/mx6 + done + + utils/combine_data.sh data/local/mx6/mx6_mic_04_to_13 \ + data/local/mx6/mx6_mic_{04,05,06,07,08,09,10,11,12,13} +fi + +if [ $stage -le 3 ]; then + utils/data/get_reco2dur.sh \ + --read-entire-file true --cmd "$train_cmd" --nj 32 --permissive true \ + data/local/mx6/mx6_mic_04_to_13 + + utils/copy_data_dir.sh data/local/mx6/mx6_mic_04_to_13 \ + data/local/mx6/mx6_mic_04_to_13_filtered + + utils/filter_scp.pl data/local/mx6/mx6_mic_04_to_13/reco2dur \ + data/local/mx6/mx6_mic_04_to_13/wav.scp > data/local/mx6_mic_04_to_13_filtered/wav.scp + + utils/fix_data_dir.sh \ + data/local/mx6/mx6_mic_04_to_13_filtered + + utils/subset_data_dir.sh \ + data/local/mx6/mx6_mic_04_to_13_filtered 2000 \ + data/local/mx6/mx6_mic_04_to_13_2k +fi + +if [ $stage -le 4 ]; then + utils/combine_data.sh data/mx6_mic \ + data/local/mx6/mx6_mic_02 data/local/mx6/mx6_mic_04_to_13_2k + + utils/copy_data_dir.sh data/local/mx6/mx6_calls data/mx6_calls +fi + +if [ $stage -le 5 ]; then + utils/combine_data.sh data/train_semisup \ + data/swbd2_phase1_train \ + data/swbd2_phase2_train \ + data/swbd2_phase3_train \ + data/swbd_cellular1_train \ + data/swbd_cellular2_train \ + data/sre2004 data/mx6_calls data/mx6_mic +fi + +mkdir -p sad_model +if [ $stage -le 6 ]; then + ( + cd sad_model + wget http://kaldi-asr.org/models/0004_tdnn_stats_asr_sad_1a.tar.gz + tar -xzvf 0004_tdnn_stats_asr_sad_1a.tar.gz + ) +fi + +if [ $stage -le 7 ]; then + steps/segmentation/detect_speech_activity.sh --stage $sad_stage \ + --cmd "$train_cmd" --nj $nj --convert-data-dir-to-whole true \ + --extra-left-context 79 --extra-right-context 21 \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 150 --mfcc-config sad_model/conf/mfcc_hires.conf \ + data/train_semisup sad_model/exp/segmentation_1a/tdnn_stats_asr_sad_1a \ + sad_model/mfcc_hires sad_model/exp/segmentation_1a/tdnn_stats_asr_sad_1a \ + data/train_semisup_1a +fi + +for f in data/train_semisup_1a_seg/{utt2spk,feats.scp}; do + if [ ! 
-f $f ]; then + echo "$0: Could not find $f" + exit 1 + fi +done From cac2df45a46022516968c541a5d94db3a84aec12 Mon Sep 17 00:00:00 2001 From: Cloud User Date: Fri, 8 Jun 2018 15:32:35 +0000 Subject: [PATCH 157/174] More robust utt2dur.sh --- .../segmentation/detect_speech_activity.sh | 3 +- egs/wsj/s5/utils/data/get_utt2dur.sh | 33 ++++++++++++++++--- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh index a7c1f7d0b0f..bfb74cb475e 100755 --- a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh +++ b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh @@ -170,7 +170,8 @@ fi ## Prepare FST we search to make speech/silence decisions. ############################################################################### -frame_shift=$(utils/data/get_frame_shift.sh $test_data_dir) +utils/data/get_utt2dur.sh --nj $nj --cmd "$cmd" $test_data_dir || exit 1 +frame_shift=$(utils/data/get_frame_shift.sh $test_data_dir) || exit 1 graph_dir=${dir}/graph_${output_name} if [ $stage -le 5 ]; then diff --git a/egs/wsj/s5/utils/data/get_utt2dur.sh b/egs/wsj/s5/utils/data/get_utt2dur.sh index 800cac81d18..192a50bc490 100755 --- a/egs/wsj/s5/utils/data/get_utt2dur.sh +++ b/egs/wsj/s5/utils/data/get_utt2dur.sh @@ -13,6 +13,7 @@ frame_shift=0.01 cmd=run.pl nj=4 +permissive=true . utils/parse_options.sh . ./path.sh @@ -39,7 +40,31 @@ fi if [ -s $data/segments ]; then echo "$0: working out $data/utt2dur from $data/segments" - cat $data/segments | awk '{len=$4-$3; print $1, len;}' > $data/utt2dur + cat $data/segments | awk '{if ($4 != -1) { len=$4-$3; print $1, len;} else { print $1, "LENGTH_NOT_FOUND"; } }' > $data/utt2dur + + if [ $(grep LENGTH_NOT_FOUND $data/utt2dur | wc -l) -ne 0 ]; then + utils/data/get_reco2dur.sh --cmd "$cmd" $data + + cat $data/segments | python3 -c "import sys +reco2dur = {} +for line in open('$data/reco2dur').readlines(): + parts = line.strip().split() + reco2dur[parts[0]] = float(parts[1]) + +for line in sys.stdin.readlines(): + parts = line.strip().split() + st = float(parts[2]) + end = float(parts[3]) + if end == -1: + if parts[1] not in reco2dur: + print ('Could not find reco {} in $data/reco2dur'.format(parts[1]), + file=sys.stderr) + sys.exit(1) + len = reco2dur[parts[1]] - st + else: + len = end - st + print ('{} {}'.format(parts[0], len))" > $data/utt2dur || exit 1 + fi elif [ -f $data/wav.scp ]; then echo "$0: segments file does not exist so getting durations from wave files" @@ -75,14 +100,13 @@ elif [ -f $data/wav.scp ]; then fi read_entire_file=false - if cat $data/wav.scp | grep -q 'sox.*speed'; then + if [ $(utils/data/internal/should_read_entire_wavefile.pl $data/wav.scp) == "true" ]; then read_entire_file=true echo "$0: reading from the entire wav file to fix the problem caused by sox commands with speed perturbation. It is going to be slow." echo "... It is much faster if you call get_utt2dur.sh *before* doing the speed perturbation via e.g. perturb_data_dir_speed.sh or " echo "... perturb_data_dir_speed_3way.sh." 
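     # (The entries that force this slow path are piped commands that change
     # the audio duration, e.g. wav.scp lines containing "sox ... speed" or
     # "sox ... trim", which is what should_read_entire_wavefile.pl checks for.)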
fi - num_utts=$(wc -l <$data/utt2spk) if [ $nj -gt $num_utts ]; then nj=$num_utts @@ -93,8 +117,7 @@ elif [ -f $data/wav.scp ]; then $cmd JOB=1:$nj $data/log/get_durations.JOB.log \ wav-to-duration --read-entire-file=$read_entire_file \ - scp:$sdata/JOB/wav.scp ark,t:$sdata/JOB/utt2dur || \ - { echo "$0: there was a problem getting the durations"; exit 1; } + scp,p:$sdata/JOB/wav.scp ark,t:$sdata/JOB/utt2dur || exit 1 for n in `seq $nj`; do cat $sdata/$n/utt2dur From 316a1c83ead6232cd342653a94224c591d1ca419 Mon Sep 17 00:00:00 2001 From: Cloud User Date: Fri, 8 Jun 2018 15:34:48 +0000 Subject: [PATCH 158/174] Check if entire wavefile should be read --- .../utils/data/internal/should_read_entire_wavefile.pl | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100755 egs/wsj/s5/utils/data/internal/should_read_entire_wavefile.pl diff --git a/egs/wsj/s5/utils/data/internal/should_read_entire_wavefile.pl b/egs/wsj/s5/utils/data/internal/should_read_entire_wavefile.pl new file mode 100755 index 00000000000..04d2dd8b619 --- /dev/null +++ b/egs/wsj/s5/utils/data/internal/should_read_entire_wavefile.pl @@ -0,0 +1,10 @@ +#!/bin/perl + +while (<>) { + if (m/sox.*speed/ || m/sox.*trim/) { + print "true"; + exit(0); + } +} + +print "false"; From d649528bc759c404014efec4736add28df660099 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sun, 24 Jun 2018 19:51:07 -0400 Subject: [PATCH 159/174] Add combine_queue_opt --- egs/wsj/s5/steps/libs/nnet3/train/common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 0219662002d..4618b81f482 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -946,6 +946,9 @@ def __init__(self, self.parser.add_argument("--egs.cmd", type=str, dest="egs_command", action=common_lib.NullstrToNoneAction, help="Script to launch egs jobs") + self.parser.add_argument("--combine-queue-opt", type=str, dest='combine_queue_opt', + default="", + help="Script to launch egs jobs") self.parser.add_argument("--use-gpu", type=str, choices=["true", "false", "yes", "no", "wait"], help="Use GPU for training. 
" From ec21ebce278c84677fe3dd29919398c013284888 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sun, 24 Jun 2018 19:52:22 -0400 Subject: [PATCH 160/174] combine_queue_opt in train.py --- egs/wsj/s5/steps/nnet3/chain/train.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 5b6d4e84af2..d5cffa72241 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -288,7 +288,7 @@ def process_args(args): run_opts.train_queue_opt = "--gpu 1" run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu) - run_opts.combine_queue_opt = "--gpu 1" + run_opts.combine_queue_opt = "--gpu 1" + " " + args.combine_queue_opt run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu) else: @@ -297,9 +297,10 @@ def process_args(args): run_opts.train_queue_opt = "" run_opts.parallel_train_opts = "--use-gpu=no" - run_opts.combine_queue_opt = "" + run_opts.combine_queue_opt = args.combine_queue_opt run_opts.combine_gpu_opt = "--use-gpu=no" + run_opts.command = args.command run_opts.egs_command = (args.egs_command if args.egs_command is not None else From 014a54ddc80b5be8426a4609bd0bdd40eb8de2e6 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 27 Jun 2018 14:46:28 -0400 Subject: [PATCH 161/174] Moving functions and cleaning up --- src/chain/chain-supervision-splitter.cc | 72 ++++++++++ src/chain/chain-supervision-splitter.h | 8 +- src/chain/chain-supervision.cc | 13 +- src/chainbin/Makefile | 3 +- src/chainbin/chain-lattice-to-post.cc | 91 ++---------- src/chainbin/nnet3-chain-split-and-get-egs.cc | 130 +++++++----------- 6 files changed, 151 insertions(+), 166 deletions(-) diff --git a/src/chain/chain-supervision-splitter.cc b/src/chain/chain-supervision-splitter.cc index a1c9597ca64..98c40eb8851 100644 --- a/src/chain/chain-supervision-splitter.cc +++ b/src/chain/chain-supervision-splitter.cc @@ -58,6 +58,78 @@ void FstToLattice(const fst::StdVectorFst &fst, Lattice *lat) { } } +/** This function converts lattice to FSA with weight equal to + sum of acoustic and language score, and pdf_id + 1 as labels. + This assumes that the acoustic and language scores are scaled appropriately. +*/ +void ConvertLatticeToPdfLabels( + const TransitionModel &tmodel, + const Lattice &ifst, + fst::StdVectorFst *ofst) { + typedef fst::ArcTpl ArcIn; + typedef fst::StdArc ArcOut; + typedef ArcIn::StateId StateId; + ofst->DeleteStates(); + // The states will be numbered exactly the same as the original FST. + // Add the states to the new FST. 
+ StateId num_states = ifst.NumStates(); + for (StateId s = 0; s < num_states; s++) + ofst->AddState(); + ofst->SetStart(ifst.Start()); + for (StateId s = 0; s < num_states; s++) { + LatticeWeight final_iweight = ifst.Final(s); + if (final_iweight != LatticeWeight::Zero()) { + fst::TropicalWeight final_oweight; + ConvertLatticeWeight(final_iweight, &final_oweight); + ofst->SetFinal(s, final_oweight); + } + for (fst::ArcIterator iter(ifst, s); + !iter.Done(); + iter.Next()) { + const ArcIn &arc = iter.Value(); + KALDI_PARANOID_ASSERT(arc.weight != LatticeWeight::Zero()); + ArcOut oarc; + ConvertLatticeWeight(arc.weight, &oarc.weight); + if (arc.ilabel == 0) + oarc.ilabel = 0; // epsilon arc + else + oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; // pdf + 1 + oarc.olabel = oarc.ilabel; + oarc.nextstate = arc.nextstate; + ofst->AddArc(s, oarc); + } + } +} + +bool LatticeToNumeratorPost(const Lattice &lat, + const TransitionModel &trans_model, + const fst::StdVectorFst &fst, + Posterior *post, std::string key) { + fst::StdVectorFst sup_fst; + ConvertLatticeToPdfLabels(trans_model, lat, &sup_fst); + + if (!AddWeightToFst(fst, &sup_fst)) { + if (!key.empty()) + KALDI_WARN << "For key " << key << ", "; + KALDI_WARN << "FST was empty after composing with FST. " + << "This should be extremely rare (a few per corpus, at most)"; + return false; + } + + // Convert fst to lattice to extract posterior using forward backward. + Lattice lat_copy; + ConvertFstToLattice(sup_fst, &lat_copy); + + kaldi::uint64 props = lat_copy.Properties(fst::kFstProperties, false); + if (!(props & fst::kTopSorted)) { + if (fst::TopSort(&lat_copy) == false) + KALDI_ERR << "Cycles detected in lattice."; + } + + LatticeForwardBackward(lat_copy, post); + return true; +} + SupervisionLatticeSplitter::SupervisionLatticeSplitter( const SupervisionLatticeSplitterOptions &opts, const SupervisionOptions &sup_opts, diff --git a/src/chain/chain-supervision-splitter.h b/src/chain/chain-supervision-splitter.h index de11c948e38..8d59f01a97f 100644 --- a/src/chain/chain-supervision-splitter.h +++ b/src/chain/chain-supervision-splitter.h @@ -175,7 +175,7 @@ class SupervisionLatticeSplitter { // This will be computed when PrepareLattice function is called. 
LatticeInfo lat_scores_; }; - + void GetToleranceEnforcerFst(const SupervisionOptions &opts, const TransitionModel &trans_model, fst::StdVectorFst *tolerance_fst); bool PhoneLatticeToSupervision(const fst::StdVectorFst &tolerance_fst, @@ -184,6 +184,12 @@ bool PhoneLatticeToSupervision(const fst::StdVectorFst &tolerance_fst, chain::Supervision *supervision, bool debug = false); +bool LatticeToNumeratorPost(const Lattice &lat, + const TransitionModel &trans_model, + const fst::StdVectorFst &fst, + Posterior *post, + std::string key = ""); + } } diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 9bffe8760e9..02f428ccc23 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -645,6 +645,15 @@ void Supervision::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); ReadBasicType(is, binary, &e2e); if (!e2e) { + if (PeekToken(is, binary) == 'N') { + ExpectToken(is, binary, ""); + numerator_post_targets.Read(is, binary); + if (PeekToken(is, binary) == 'N') { + ExpectToken(is, binary, ""); + BaseFloat temp; + ReadBasicType(is, binary, &temp); + } + } if (!binary) { ReadFstKaldi(is, binary, &fst); } else { @@ -904,8 +913,10 @@ bool AddWeightToFst(const fst::StdVectorFst &normalization_fst, fst::StdVectorFst composed_fst; fst::Compose(supervision_fst_noeps, normalization_fst, &composed_fst); - if (composed_fst.NumStates() == 0) + if (composed_fst.NumStates() == 0) { + KALDI_WARN << "FST empty after composing with normalization FST."; return false; + } // projection should not be necessary, as both FSTs are acceptors. // determinize and minimize to make it as compact as possible. diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile index a8626dae2c7..6efc10eadf3 100644 --- a/src/chainbin/Makefile +++ b/src/chainbin/Makefile @@ -13,7 +13,8 @@ BINFILES = chain-est-phone-lm chain-get-supervision chain-make-den-fst \ nnet3-chain-combine nnet3-chain-normalize-egs \ nnet3-chain-e2e-get-egs nnet3-chain-compute-post \ nnet3-chain-split-and-get-egs chain-split-lattices \ - nnet3-chain-split-convert-and-get-egs chain-lattice-to-post + nnet3-chain-split-convert-and-get-egs \ + chain-lattice-to-post chain-fst-to-post OBJFILES = diff --git a/src/chainbin/chain-lattice-to-post.cc b/src/chainbin/chain-lattice-to-post.cc index 561014cf424..d07dd8fef1f 100644 --- a/src/chainbin/chain-lattice-to-post.cc +++ b/src/chainbin/chain-lattice-to-post.cc @@ -25,82 +25,7 @@ #include "hmm/posterior.h" #include "lat/lattice-functions.h" #include "chain/chain-supervision.h" - -namespace kaldi { -namespace chain { - -/** This function converts lattice to FSA with weight equal to - sum of acoustic and language score, and pdf_id + 1 as labels. - This assumes that the acoustic and language scores are scaled appropriately. -*/ -void ConvertLatticeToPdfLabels( - const TransitionModel &tmodel, - const Lattice &ifst, - fst::StdVectorFst *ofst) { - typedef fst::ArcTpl ArcIn; - typedef fst::StdArc ArcOut; - typedef ArcIn::StateId StateId; - ofst->DeleteStates(); - // The states will be numbered exactly the same as the original FST. - // Add the states to the new FST. 
- StateId num_states = ifst.NumStates(); - for (StateId s = 0; s < num_states; s++) - ofst->AddState(); - ofst->SetStart(ifst.Start()); - for (StateId s = 0; s < num_states; s++) { - LatticeWeight final_iweight = ifst.Final(s); - if (final_iweight != LatticeWeight::Zero()) { - fst::TropicalWeight final_oweight; - ConvertLatticeWeight(final_iweight, &final_oweight); - ofst->SetFinal(s, final_oweight); - } - for (fst::ArcIterator iter(ifst, s); - !iter.Done(); - iter.Next()) { - const ArcIn &arc = iter.Value(); - KALDI_PARANOID_ASSERT(arc.weight != LatticeWeight::Zero()); - ArcOut oarc; - ConvertLatticeWeight(arc.weight, &oarc.weight); - if (arc.ilabel == 0) - oarc.ilabel = 0; // epsilon arc - else - oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; // pdf + 1 - oarc.olabel = oarc.ilabel; - oarc.nextstate = arc.nextstate; - ofst->AddArc(s, oarc); - } - } -} - -void LatticeToNumeratorPost(const Lattice &lat, - const TransitionModel &trans_model, - const fst::StdVectorFst &fst, - BaseFloat lm_scale, std::string key, - Posterior *post) { - fst::StdVectorFst sup_fst; - ConvertLatticeToPdfLabels(trans_model, lat, &sup_fst); - - if (!AddWeightToFst(fst, &sup_fst)) { - KALDI_WARN << "For utterance " << key << ", feature frames " - << ", FST was empty after composing with normalization FST. " - << "This should be extremely rare (a few per corpus, at most)"; - } - - // Convert fst to lattice to extract posterior using forward backward. - Lattice lat_copy; - ConvertFstToLattice(sup_fst, &lat_copy); - - kaldi::uint64 props = lat_copy.Properties(fst::kFstProperties, false); - if (!(props & fst::kTopSorted)) { - if (fst::TopSort(&lat_copy) == false) - KALDI_ERR << "Cycles detected in lattice."; - } - - LatticeForwardBackward(lat_copy, post); -} - -} // namespace chain -} // namespace kaldi +#include "chain/chain-supervision-splitter.h" int main(int argc, char *argv[]) { @@ -174,7 +99,7 @@ int main(int argc, char *argv[]) { SequentialLatticeReader lattice_reader(lattice_rspecifier); PosteriorWriter posterior_writer(post_wspecifier); - int32 num_done = 0; + int32 num_done = 0, num_fail = 0; for (; !lattice_reader.Done(); lattice_reader.Next()) { std::string key = lattice_reader.Key(); @@ -183,15 +108,19 @@ int main(int argc, char *argv[]) { fst::ScaleLattice(fst::LatticeScale(1.0 - fst_scale, acoustic_scale), &lat); Posterior graph_post; - LatticeToNumeratorPost( - lat, trans_model, fst, - 1.0 - fst_scale , key, &graph_post); + bool status = LatticeToNumeratorPost(lat, trans_model, fst, + &graph_post, key); + if (!status) { + num_fail++; + continue; + } posterior_writer.Write(key, graph_post); num_done++; } - KALDI_LOG << "Converted " << num_done << " lattices to posteriors"; + KALDI_LOG << "Converted " << num_done << " lattices to posteriors; " + << "failed for " << num_fail; return num_done > 0 ? 0 : 1; } catch(const std::exception &e) { diff --git a/src/chainbin/nnet3-chain-split-and-get-egs.cc b/src/chainbin/nnet3-chain-split-and-get-egs.cc index 4e1a22c6f7c..86c1b1868a0 100644 --- a/src/chainbin/nnet3-chain-split-and-get-egs.cc +++ b/src/chainbin/nnet3-chain-split-and-get-egs.cc @@ -32,81 +32,6 @@ namespace kaldi { namespace nnet3 { -/** This function converts lattice to FSA with weight equal to - sum of acoustic and language score, and pdf_id + 1 as labels. - This assumes that the acoustic and language scores are scaled appropriately. 
-*/ -void ConvertLatticeToPdfLabels( - const TransitionModel &tmodel, - const Lattice &ifst, - fst::StdVectorFst *ofst) { - typedef fst::ArcTpl ArcIn; - typedef fst::StdArc ArcOut; - typedef ArcIn::StateId StateId; - ofst->DeleteStates(); - // The states will be numbered exactly the same as the original FST. - // Add the states to the new FST. - StateId num_states = ifst.NumStates(); - for (StateId s = 0; s < num_states; s++) - ofst->AddState(); - ofst->SetStart(ifst.Start()); - for (StateId s = 0; s < num_states; s++) { - LatticeWeight final_iweight = ifst.Final(s); - if (final_iweight != LatticeWeight::Zero()) { - fst::TropicalWeight final_oweight; - ConvertLatticeWeight(final_iweight, &final_oweight); - ofst->SetFinal(s, final_oweight); - } - for (fst::ArcIterator iter(ifst, s); - !iter.Done(); - iter.Next()) { - const ArcIn &arc = iter.Value(); - KALDI_PARANOID_ASSERT(arc.weight != LatticeWeight::Zero()); - ArcOut oarc; - ConvertLatticeWeight(arc.weight, &oarc.weight); - if (arc.ilabel == 0) - oarc.ilabel = 0; // epsilon arc - else - oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; // pdf + 1 - oarc.olabel = oarc.ilabel; - oarc.nextstate = arc.nextstate; - ofst->AddArc(s, oarc); - } - } -} - -void LatticeToNumeratorPost(const Lattice &lat, - const TransitionModel &trans_model, - const fst::StdVectorFst &normalization_fst, - BaseFloat lm_scale, std::string key, - Posterior *post) { - Lattice lat_copy(lat); - - if (normalization_fst.NumStates() > 0) - fst::ScaleLattice(fst::GraphLatticeScale(lm_scale), &lat_copy); - - fst::StdVectorFst sup_fst; - ConvertLatticeToPdfLabels(trans_model, lat_copy, &sup_fst); - - if (normalization_fst.NumStates() > 0 && - !chain::AddWeightToFst(normalization_fst, &sup_fst)) { - KALDI_WARN << "For utterance " << key << ", feature frames " - << ", FST was empty after composing with normalization FST. " - << "This should be extremely rare (a few per corpus, at most)"; - } - - // Convert fst to lattice to extract posterior using forward backward. - ConvertFstToLattice(sup_fst, &lat_copy); - - kaldi::uint64 props = lat_copy.Properties(fst::kFstProperties, false); - if (!(props & fst::kTopSorted)) { - if (fst::TopSort(&lat_copy) == false) - KALDI_ERR << "Cycles detected in lattice."; - } - - LatticeForwardBackward(lat_copy, post); -} - /** This function does all the processing for one utterance, and outputs the supervision objects to 'example_writer'. 
Note: if normalization_fst is the @@ -127,7 +52,8 @@ static bool ProcessFile(const chain::SupervisionOptions &sup_opts, const std::string &utt_id, bool compress, UtteranceSplitter *utt_splitter, - NnetChainExampleWriter *example_writer) { + NnetChainExampleWriter *example_writer, + bool add_numerator_post = false) { int32 num_input_frames = feats.NumRows(); @@ -167,13 +93,46 @@ static bool ProcessFile(const chain::SupervisionOptions &sup_opts, chain::Supervision supervision_part; + Lattice *lat_part = NULL; + + if (add_numerator_post) + lat_part = new Lattice(); if (!sup_lat_splitter.GetFrameRangeSupervision(start_frame_subsampled, num_frames_subsampled, - &supervision_part)) - return false; + &supervision_part, + NULL, lat_part)) { + delete lat_part; + continue; + } + + if (add_numerator_post) { + Posterior post_part; + if (!chain::LatticeToNumeratorPost(*lat_part, trans_model, + normalization_fst, &post_part)) { + delete lat_part; + continue; + } + KALDI_ASSERT(post_part.size() == num_frames_subsampled); + + Posterior labels(num_frames_subsampled); + + for (int32 i = 0; i < num_frames_subsampled; i++) { + for (int32 j = 0; j < post_part[i].size(); j++) { + BaseFloat post = post_part[i][j].second; + KALDI_ASSERT(post_part[i][j].first > 0); + if (post > min_post) { + labels[i].push_back(std::make_pair( + post_part[i][j].first - 1, post)); // Convert from 1-index to 0-index + } + } + } + + SparseMatrix smat(trans_model.NumPdfs(), labels); + supervision_part.numerator_post_targets = smat; - if (graph_posteriors) { + delete lat_part; + } else if (graph_posteriors) { Posterior labels; labels.resize(num_frames_subsampled); for (int32 i = 0; i < num_frames_subsampled; i++) { @@ -316,6 +275,7 @@ int main(int argc, char *argv[]) { graph_posterior_rspecifier; BaseFloat min_post = 1e-8; + bool add_numerator_post = false; ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs with input features " @@ -344,6 +304,9 @@ int main(int argc, char *argv[]) { "Pdf posteriors where the labels are 1-indexed"); po.Register("min-post", &min_post, "Minimum posterior to keep; this will " "avoid dumping out all posteriors."); + po.Register("add-numerator-post", &add_numerator_post, + "Add numerator post to supervision; this is alternative to " + "graph-posterior-rspecifier"); eg_config.Register(&po); @@ -387,11 +350,14 @@ int main(int argc, char *argv[]) { eg_config.ComputeDerived(); UtteranceSplitter utt_splitter(eg_config); + if (add_numerator_post) + KALDI_ASSERT(!normalization_fst_rxfilename.empty()); + fst::StdVectorFst normalization_fst; if (!normalization_fst_rxfilename.empty()) { ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); KALDI_ASSERT(normalization_fst.NumStates() > 0); - + if (sup_opts.lm_scale < 0.0 || sup_opts.lm_scale > 1.0) { KALDI_ERR << "Invalid lm-scale; must be in [0.0, 1.0)"; } @@ -459,7 +425,7 @@ int main(int argc, char *argv[]) { num_err++; continue; } - + const Vector *deriv_weights = NULL; if (!deriv_weights_rspecifier.empty()) { if (!deriv_weights_reader.HasKey(key)) { @@ -494,7 +460,7 @@ int main(int argc, char *argv[]) { deriv_weights, graph_posteriors, min_post, supervision_length_tolerance, key, compress, - &utt_splitter, &example_writer)) + &utt_splitter, &example_writer, add_numerator_post)) num_err++; } } From 1a77f84078a70c03ebb4791ec118c99f7f652e4b Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 27 Jun 2018 14:49:47 -0400 Subject: [PATCH 162/174] Fixes to aspire related scripts --- 
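A note on the first hunk below: bash only recognizes the =~ match operator inside the [[ ]] compound command, not in the single-bracket test builtin, which is why the dev_aspire check in generate_uniformly_segmented_data_dir.sh is rewritten. A minimal sketch of the difference (the data_set value here is made up for illustration):

    data_set=dev_aspire_whole
    if [ "$data_set" =~ "dev_aspire" ]; then echo match; fi    # errors out: the '[' builtin has no '=~' operator
    if [[ "$data_set" =~ "dev_aspire" ]]; then echo match; fi  # prints "match"; '=~' works only inside [[ ]]

The other hunks replace the hand-rolled sox/speed sniffing in reverberate_data_dir.py with a call to utils/data/get_reco2dur.sh, and fix get_reco2dur.sh itself to copy utt2dur to reco2dur (rather than reco2utt) and to read wav.scp with scp,p: so that unreadable recordings are skipped instead of failing the job.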
.../s5/local/generate_uniformly_segmented_data_dir.sh | 2 +- egs/wsj/s5/steps/data/reverberate_data_dir.py | 8 +------- egs/wsj/s5/utils/data/get_reco2dur.sh | 9 ++++----- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh b/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh index db1c04e252d..0af89e46105 100755 --- a/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh +++ b/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh @@ -31,7 +31,7 @@ fi data_set=$1 segmented_data_set=$2 -if [ "$data_set" =~ "dev_aspire" ]; then +if [[ "$data_set" =~ "dev_aspire" ]]; then if [ $stage -le 1 ]; then echo "$0: Creating the data dir with whole recordings without segmentation" # create a whole directory without the segments diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index 71e64d9e680..f6be7a286ec 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -413,13 +413,7 @@ def CreateReverberatedCopy(input_dir, wav_scp = ParseFileToDict(input_dir + "/wav.scp", value_processor = lambda x: " ".join(x)) if not os.path.isfile(input_dir + "/reco2dur"): print("Getting the duration of the recordings..."); - read_entire_file="false" - for value in wav_scp.values(): - # we will add more checks for sox commands which modify the header as we come across these cases in our data - if "sox" in value and "speed" in value: - read_entire_file="true" - break - data_lib.RunKaldiCommand("wav-to-duration --read-entire-file={1} scp:{0}/wav.scp ark,t:{0}/reco2dur".format(input_dir, read_entire_file)) + data_lib.RunKaldiCommand("utils/data/get_reco2dur.sh {}".format(input_dir)) durations = ParseFileToDict(input_dir + "/reco2dur", value_processor = lambda x: float(x[0])) foreground_snr_array = map(lambda x: float(x), foreground_snr_string.split(':')) background_snr_array = map(lambda x: float(x), background_snr_string.split(':')) diff --git a/egs/wsj/s5/utils/data/get_reco2dur.sh b/egs/wsj/s5/utils/data/get_reco2dur.sh index 4b45c6f00a3..8f9e27d8094 100755 --- a/egs/wsj/s5/utils/data/get_reco2dur.sh +++ b/egs/wsj/s5/utils/data/get_reco2dur.sh @@ -51,7 +51,7 @@ if [ -s $data/utt2dur ] && \ [ ! -s $data/segments ]; then echo "$0: $data/wav.scp indexed by utt-id; copying utt2dur to reco2dur" - cp $data/utt2dur $data/reco2utt && exit 0; + cp $data/utt2dur $data/reco2dur && exit 0; elif [ -f $data/wav.scp ]; then echo "$0: obtaining durations from recordings" @@ -88,7 +88,7 @@ elif [ -f $data/wav.scp ]; then fi read_entire_file=false - if grep -q 'sox.*speed' $data/wav.scp; then + if [ $(utils/data/internal/should_read_entire_wavefile.pl $data/wav.scp) == "true" ]; then read_entire_file=true echo "$0: reading from the entire wav file to fix the problem caused by sox commands with speed perturbation. It is going to be slow." echo "... It is much faster if you call get_reco2dur.sh *before* doing the speed perturbation via e.g. 
perturb_data_dir_speed.sh or " @@ -111,12 +111,11 @@ elif [ -f $data/wav.scp ]; then fi utils/split_scp.pl $data/wav.scp $wavscps - + $cmd JOB=1:$nj $data/log/get_reco_durations.JOB.log \ wav-to-duration --read-entire-file=$read_entire_file \ - scp:$temp_data_dir/JOB/wav.scp ark,t:$temp_data_dir/JOB/reco2dur || \ - { echo "$0: there was a problem getting the durations"; exit 1; } # This could + scp,p:$temp_data_dir/JOB/wav.scp ark,t:$temp_data_dir/JOB/reco2dur || exit 1 for n in `seq $nj`; do cat $temp_data_dir/$n/reco2dur From dc69be0ffaf92de4ac4a373bcf0adf5a4dcb9bbb Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 27 Jun 2018 14:50:30 -0400 Subject: [PATCH 163/174] numerator post in get_egs_split.sh --- egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh | 66 +++++++++++++------ .../nnet3/chain/make_weighted_den_fst.sh | 7 +- 2 files changed, 51 insertions(+), 22 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh index 5d06b3acb5b..4a1f9d7c982 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh @@ -55,9 +55,13 @@ left_tolerance= right_tolerance_silence= # Tolerances for silence phones left_tolerance_silence= +add_numerator_post=false + kl_latdir= kl_fst_scale=0.5 +graph_posterior_rspecifier= + stage=0 max_jobs_run=15 # This should be set to the maximum number of nnet3-chain-get-egs jobs you are # comfortable to run in parallel; you can increase it if your disk @@ -282,6 +286,8 @@ chain_supervision_all_opts="--supervision.frame-subsampling-factor=$alignment_su chain_supervision_all_opts="$chain_supervision_all_opts --supervision.left-tolerance=$left_tolerance" +chain_supervision_all_opts="$chain_supervision_all_opts --add-numerator-post=$add_numerator_post" + normalization_fst_scale=1.0 lats_rspecifier="ark,s,cs:gunzip -c $latdir/lat.JOB.gz |" @@ -328,20 +334,21 @@ echo $right_context > $dir/info/right_context echo $left_context_initial > $dir/info/left_context_initial echo $right_context_final > $dir/info/right_context_final -graph_posterior_rspecifier= -if [ ! -z "$kl_latdir" ]; then - if [ $stage -le 1 ]; then - steps/nnet3/chain/get_chain_graph_post.sh \ - --cmd "$cmd" --fst-scale $kl_fst_scale --acwt $acwt \ - $chaindir $kl_latdir $dir || exit 1 - fi +if [ -z "$graph_posterior_rspecifier" ]; then + if [ ! -z "$kl_latdir" ]; then + if [ $stage -le 1 ]; then + steps/nnet3/chain/get_chain_graph_post.sh \ + --cmd "$cmd" --fst-scale $kl_fst_scale --acwt $acwt \ + $chaindir $kl_latdir $dir || exit 1 + fi - if [ ! -s "$dir/numerator_post.scp" ]; then - echo "$0: Could not find $dir/numerator_post.scp. Something went wrong." - exit 1 - fi + if [ ! -s "$dir/numerator_post.scp" ]; then + echo "$0: Could not find $dir/numerator_post.scp. Something went wrong." + exit 1 + fi - graph_posterior_rspecifier="scp:$dir/numerator_post.scp" + graph_posterior_rspecifier="scp:$dir/numerator_post.scp" + fi fi if [ $stage -le 2 ]; then @@ -422,6 +429,11 @@ if [ $stage -le 4 ]; then done echo "$0: Generating training examples on disk" + normalization_fst_maybe= + if $add_numerator_post; then + normalization_fst_maybe=$chaindir/normalization.fst + fi + # The examples will go round-robin to egs_list. 
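  # (If --add-numerator-post is true, normalization_fst_maybe above supplies the normalization FST to the get-egs binary right here, since the numerator posteriors have to be computed from lattices composed with it, and the nnet3-chain-normalize-egs step further down is skipped; otherwise the usual scheme described next applies.)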
Note: we omit the # 'normalization.fst' argument while creating temporary egs: the phase of egs # preparation that involves the normalization FST is quite CPU-intensive and @@ -438,7 +450,7 @@ if [ $stage -le 4 ]; then $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ --num-frames-overlap=$frames_overlap_per_eg \ ${graph_posterior_rspecifier:+--graph-posterior-rspecifier="$graph_posterior_rspecifier"} \ - "$feats" $chaindir/tree $chaindir/0.trans_mdl \ + $normalization_fst_maybe "$feats" $chaindir/tree $chaindir/0.trans_mdl \ ark,s,cs:- ark:- \| \ nnet3-chain-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; fi @@ -460,10 +472,17 @@ if [ $stage -le 5 ]; then else output_archive="ark:$dir/cegs.JOB.ark" fi - $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ - nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- $output_archive || exit 1; + if ! $add_numerator_post; then + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- $output_archive || exit 1; + else + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" \ + $output_archive || exit 1; + fi + if $generate_egs_scp; then #concatenate cegs.JOB.scp in single cegs.scp rm -rf $dir/cegs.scp @@ -490,11 +509,16 @@ if [ $stage -le 5 ]; then ln -sf cegs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1 done done - $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ - nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:- \| \ - nnet3-chain-copy-egs ark:- $output_archives || exit 1; - + if ! $add_numerator_post; then + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:- \| \ + nnet3-chain-copy-egs ark:- $output_archives || exit 1; + else + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-copy-egs ark:- $output_archives || exit 1; + fi if $generate_egs_scp; then #concatenate cegs.JOB.scp in single cegs.scp rm -f $dir/cegs.scp diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index 45a48c10c91..3b6371168ce 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -93,7 +93,7 @@ for n in `seq 0 $[num_alignments-1]`; do this_num_repeats=${num_repeats_array[$n]} this_alignment_dir=${ali_dirs[$n]} num_jobs=$(cat $this_alignment_dir/num_jobs) - if ! [ "$this_num_repeats" -gt 0 ]; then + if ! 
[ "$this_num_repeats" -ge 0 ]; then
     echo "Expected comma-separated list of integers for --num-repeats option, got '$num_repeats'"
     exit 1
   fi
@@ -104,6 +104,11 @@ for n in `seq 0 $[num_alignments-1]`; do
       ali-to-phones $this_alignment_dir/final.mdl ark:- "ark:|gzip -c >$dir/phones.$n.gz" || exit 1;
   fi
 
+  if [ ! -s $dir/phones.$n.gz ]; then
+    echo "$dir/phones.$n.gz is empty or does not exist"
+    exit 1
+  fi
+
   all_phones="$all_phones $(for r in $(seq $this_num_repeats); do echo $dir/phones.$n.gz; done)"
 done

From 0bc2b64307fb5b26d5f5ae7f2f26b74a0bd8f259 Mon Sep 17 00:00:00 2001
From: Vimal Manohar
Date: Wed, 27 Jun 2018 14:51:26 -0400
Subject: [PATCH 164/174] Adding train_queue_opt

---
 egs/wsj/s5/steps/libs/nnet3/train/common.py | 3 +++
 egs/wsj/s5/steps/nnet3/chain/train.py       | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py
index 4618b81f482..1adfa8a74e2 100644
--- a/egs/wsj/s5/steps/libs/nnet3/train/common.py
+++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py
@@ -949,6 +949,9 @@ def __init__(self,
         self.parser.add_argument("--combine-queue-opt", type=str, dest='combine_queue_opt',
                                  default="",
                                  help="Script to launch egs jobs")
+        self.parser.add_argument("--train-queue-opt", type=str, dest='train_queue_opt',
+                                 default="",
+                                 help="Extra options to pass to the queue when launching the training jobs")
         self.parser.add_argument("--use-gpu", type=str,
                                  choices=["true", "false", "yes", "no", "wait"],
                                  help="Use GPU for training. "
diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py
index d5cffa72241..066a193e770 100755
--- a/egs/wsj/s5/steps/nnet3/chain/train.py
+++ b/egs/wsj/s5/steps/nnet3/chain/train.py
@@ -286,7 +286,7 @@ def process_args(args):
                 If you have GPUs and have nvcc installed, go to src/ and do
                 ./configure; make""")
-        run_opts.train_queue_opt = "--gpu 1"
+        run_opts.train_queue_opt = "--gpu 1" + " " + args.train_queue_opt
         run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu)
         run_opts.combine_queue_opt = "--gpu 1" + " " + args.combine_queue_opt
         run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu)
@@ -295,7 +295,7 @@ def process_args(args):
         logger.warning("Without using a GPU this will be very slow.
" "nnet3 does not yet support multiple threads.") - run_opts.train_queue_opt = "" + run_opts.train_queue_opt = args.train_queue_opt run_opts.parallel_train_opts = "--use-gpu=no" run_opts.combine_queue_opt = args.combine_queue_opt run_opts.combine_gpu_opt = "--use-gpu=no" From 0b34eb21a44a9db26f1b7a23e53a48260b75cb02 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 28 Jun 2018 15:00:45 -0400 Subject: [PATCH 165/174] Adding semisup ts learning --- egs/aspire/s5/local/semisup/build_silprob.sh | 15 + .../chain/tuning/run_tdnn_lstm_300k_1a.sh | 327 ++++++++++++ .../chain/tuning/run_tdnn_lstm_300k_1b.sh | 355 +++++++++++++ .../tuning/run_tdnn_lstm_300k_kl_ts_1b.sh | 463 +++++++++++++++++ .../tuning/run_tdnn_lstm_300k_kl_ts_ami_1a.sh | 290 +++++++++++ .../tuning/run_tdnn_lstm_300k_kl_ts_ami_1b.sh | 292 +++++++++++ .../run_tdnn_lstm_300k_kl_ts_ami_subset_1a.sh | 420 +++++++++++++++ .../tuning/run_tdnn_lstm_300k_kl_wt_ami_1a.sh | 286 +++++++++++ .../tuning/run_tdnn_lstm_300k_norvb_1a.sh | 288 +++++++++++ .../run_tdnn_lstm_300k_semisup_ts_1a.sh | 370 +++++++++++++ .../run_tdnn_lstm_300k_semisup_ts_1b.sh | 460 +++++++++++++++++ .../run_tdnn_lstm_300k_semisup_ts_1c.sh | 469 +++++++++++++++++ .../run_tdnn_lstm_300k_semisup_ts_1d.sh | 484 ++++++++++++++++++ .../run_tdnn_lstm_300k_semisup_ts_ami_1a.sh | 292 +++++++++++ .../run_tdnn_lstm_300k_semisup_ts_ami_1b.sh | 292 +++++++++++ .../run_tdnn_lstm_300k_semisup_ts_ami_1c.sh | 333 ++++++++++++ .../run_tdnn_lstm_300k_semisup_ts_ami_1d.sh | 331 ++++++++++++ .../run_tdnn_lstm_300k_semisup_ts_ami_1e.sh | 328 ++++++++++++ .../run_tdnn_lstm_300k_semisup_ts_ami_1f.sh | 323 ++++++++++++ .../run_tdnn_lstm_300k_semisup_ts_ami_1g.sh | 323 ++++++++++++ .../run_tdnn_lstm_300k_semisup_ts_ami_1h.sh | 307 +++++++++++ .../run_tdnn_lstm_300k_semisup_ts_ami_1i.sh | 420 +++++++++++++++ .../run_tdnn_lstm_300k_semisup_ts_ami_1j.sh | 421 +++++++++++++++ ...tdnn_lstm_300k_semisup_ts_ami_subset_1a.sh | 424 +++++++++++++++ .../run_tdnn_lstm_300k_semisup_wt_1b.sh | 430 ++++++++++++++++ .../run_tdnn_lstm_300k_semisup_wt_ami_1a.sh | 297 +++++++++++ .../run_tdnn_lstm_300k_wgt_semisup_wt_1b.sh | 430 ++++++++++++++++ egs/aspire/s5/local/semisup/copy_lat_dir.sh | 52 ++ .../nnet3/run_student_ivector_common.sh | 104 ++++ egs/aspire/s5/local/semisup/run_300k.sh | 18 + 30 files changed, 9644 insertions(+) create mode 100755 egs/aspire/s5/local/semisup/build_silprob.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_1a.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_1b.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_1b.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_1a.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_1b.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_subset_1a.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_wt_ami_1a.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_norvb_1a.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1a.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1b.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1c.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1d.sh create mode 
100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1a.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1b.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1c.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1d.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1e.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1f.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1g.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1h.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1i.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1j.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_subset_1a.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_wt_1b.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_wt_ami_1a.sh create mode 100755 egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_wgt_semisup_wt_1b.sh create mode 100755 egs/aspire/s5/local/semisup/copy_lat_dir.sh create mode 100755 egs/aspire/s5/local/semisup/nnet3/run_student_ivector_common.sh create mode 100644 egs/aspire/s5/local/semisup/run_300k.sh diff --git a/egs/aspire/s5/local/semisup/build_silprob.sh b/egs/aspire/s5/local/semisup/build_silprob.sh new file mode 100755 index 00000000000..c51e3ea05e3 --- /dev/null +++ b/egs/aspire/s5/local/semisup/build_silprob.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -e + +. ./cmd.sh +. ./path.sh + +steps/get_prons.sh --cmd "$train_cmd" data/train_300k data/lang exp/semisup300k/tri5b + +utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict exp/semisup300k/tri5b/pron_counts_nowb.txt \ + exp/semisup300k/tri5b/sil_counts_nowb.txt \ + exp/semisup300k/tri5b/pron_bigram_counts_nowb.txt data/local/dict_300k_pp + +utils/prepare_lang.sh data/local/dict_300k_pp "" data/local/lang_300k_pp data/lang_300k_pp diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_1a.sh new file mode 100755 index 00000000000..cd0abc6792a --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_1a.sh @@ -0,0 +1,327 @@ +#!/bin/bash + +set -e + +# based on run_tdnn_7b.sh in the swbd recipe + +# configs for 'chain' +affix=v8 + +stage=0 +train_stage=-10 +get_egs_stage=-10 +test_stage=1 +nj=70 + +train_set=train_300k +exp=exp/semisup300k +gmm=tri5a + +tdnn_affix=_1a +tree_affix=bi_a +chain_affix= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/uttlist + + utils/subset_data_dir.sh --utt-list $dir/uttlist data/train_rvb_hires \ + data/${rvb_train_set}_hires + + utils/subset_data_dir.sh --utt-list $dir/uttlist data/train_rvb \ + data/${rvb_train_set} +fi + +norvb_lat_dir=${exp}/chain${chain_affix}/${gmm}_train_lats + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 30 --cmd "$train_cmd" \ + --generate-ali-from-lats true data/$train_set \ + data/lang $gmm_dir $norvb_lat_dir || exit 1; + rm $norvb_lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 9 ]; then + mkdir -p $lat_dir + + utils/split_data.sh data/${rvb_train_set} $nj + + for n in `seq $nj`; do + awk '{print $1}' data/${rvb_train_set}/split$nj/$n/utt2spk | \ + perl -ane 's/rev[1-3]_//g' > $lat_dir/uttlist.$n.$nj + done + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd JOB=1:$norvb_nj $lat_dir/log/copy_lattices.JOB.log \ + lattice-copy "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done > $lat_dir/lat_rvb.scp + + $train_cmd JOB=1:$nj $lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy \ + "scp:utils/filter_scp.pl data/${rvb_train_set}/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $lat_dir/lat.JOB.gz" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree; do + cp $norvb_lat_dir/$f $lat_dir/$f + done +fi + +if [ $stage -le 10 ]; then + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + rm -rf $lang + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ # we build the tree using clean features (data/train) rather than + # the augmented features (data/train_rvb) to get better alignments + + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set} $lang ${exp}/${gmm} $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
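+  # For example, with xent_regularize=0.025 as set at the top of this script, the factor works out to 0.5 / 0.025 = 20.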
+ output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! -f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi + +exit 0 + +if [ $stage -le 16 ]; then + local/nnet3/prep_test_aspire.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +if [ $stage -le 17 ]; then +# #Online decoding example + + local/nnet3/prep_test_aspire_online.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --max-count 75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + + + + +exit 0; + diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_1b.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_1b.sh new file mode 100755 index 00000000000..896129cb941 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_1b.sh @@ -0,0 +1,355 @@ +#!/bin/bash + +set -e + +# based on run_tdnn_7b.sh in the swbd recipe + +# configs for 'chain' +affix=v8 + +stage=0 +train_stage=-10 +get_egs_stage=-10 +test_stage=1 +nj=70 + +train_set=train_300k +exp=exp/semisup300k +gmm=tri5a + +tdnn_affix=_1b +tree_affix=bi_b +chain_affix= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/uttlist + + utils/subset_data_dir.sh --utt-list $dir/uttlist data/train_rvb_hires \ + data/${rvb_train_set}_hires + + utils/subset_data_dir.sh --utt-list $dir/uttlist data/train_rvb \ + data/${rvb_train_set} +fi + +if [ $stage -le 9 ]; then + utils/data/perturb_data_dir_speed_3way.sh data/${rvb_train_set}_hires \ + data/${rvb_train_set}_sp_hires + utils/data/perturb_data_dir_volume.sh data/${rvb_train_set}_sp_hires + + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires.conf \ + data/${rvb_train_set}_sp_hires + steps/compute_cmvn_stats.sh data/${rvb_train_set}_sp_hires +fi + +if [ $stage -le 10 ]; then + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} \ + data/${train_set}_sp + + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set}_sp + steps/compute_cmvn_stats.sh data/${train_set}_sp +fi + +norvb_lat_dir=${exp}/chain${chain_affix}/${gmm}_${train_set}_sp_lats + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
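+  # ('freedom' here means the lattice keeps alternative alignments and pronunciations rather than a single forced path, so the numerator supervision is not pinned to one alignment.)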
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 30 --cmd "$train_cmd" \ + --generate-ali-from-lats true data/${train_set}_sp \ + data/lang $gmm_dir $norvb_lat_dir || exit 1; + rm $norvb_lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + mkdir -p $lat_dir + + utils/split_data.sh data/${rvb_train_set}_sp_hires $nj + + for n in `seq $nj`; do + awk '{print $1}' data/${rvb_train_set}_sp_hires/split$nj/$n/utt2spk | \ + perl -ane 's/rev[1-3]_//g' > $lat_dir/uttlist.$n.$nj + done + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd JOB=1:$norvb_nj $lat_dir/log/copy_lattices.JOB.log \ + lattice-copy "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | \ + perl -pe 's:(rev[1-3])_(sp0.9|sp1.1)-:\2-\1_:g' | sort -k1,1 > $lat_dir/lat_rvb.scp + + $train_cmd JOB=1:$nj $lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy \ + "scp:utils/filter_scp.pl data/${rvb_train_set}_sp_hires/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $lat_dir/lat.JOB.gz" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree; do + cp $norvb_lat_dir/$f $lat_dir/$f + done +fi + +if [ $stage -le 13 ]; then + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + rm -rf $lang + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. + # we build the tree using clean features (data/train) rather than + # the augmented features (data/train_rvb) to get better alignments + + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $norvb_lat_dir $treedir || exit 1 +fi + +if [ $stage -le 15 ]; then + if [ ! 
-f exp/nnet3/ivectors_${rvb_train_set}_sp/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${rvb_train_set}_sp_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${rvb_train_set}_sp || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi +fi + +if [ $stage -le 16 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 18 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 19 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! -f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi + +exit 0 + +if [ $stage -le 19 ]; then + local/nnet3/prep_test_aspire.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +if [ $stage -le 17 ]; then +# #Online decoding example + + local/nnet3/prep_test_aspire_online.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --max-count 75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + + + + +exit 0; + diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_1b.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_1b.sh new file mode 100755 index 00000000000..da1c9962c92 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_1b.sh @@ -0,0 +1,463 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +test_stage=0 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +extractor=exp/nnet3/extractor + +sup_lat_dir=exp/semisup300k/chain/tri5b_train_300k_rvb_sp_lats +supervised_set=train_300k_rvb_sp + +norvb_unsupervised_set=train +unsupervised_set=train_rvb + +tdnn_affix=_1b +chain_affix=_kl_ts + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +lm_weights=1,3 # 3 - To compensate for using alignments before reverberation. +supervision_weights=1.0,1.0 +num_copies=2,1 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $unsup_lat_dir/uttlist.$n.$nj + #done + + rm -f $unsup_lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_unsup_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $unsup_lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_unsup_lat_dir/lat.JOB.gz |" \ + ark,scp:$unsup_lat_dir/lat_tmp.JOB.ark,$unsup_lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $unsup_lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $unsup_lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $unsup_lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${unsupervised_set}_hires/split$nj/JOB/utt2spk $unsup_lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $unsup_lat_dir/lat.JOB.gz" || exit 1 + + rm $unsup_lat_dir/lat_tmp.* $unsup_lat_dir/lat_rvb.scp + + echo $nj > $unsup_lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_unsup_lat_dir/$f ]; then cp $norvb_unsup_lat_dir/$f $unsup_lat_dir/$f; fi + done +fi + +ln -sf ../final.mdl $unsup_lat_dir/final.mdl + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} $decode_lang ${norvb_unsup_lat_dir} \ + $src_dir/best_path_${norvb_unsupervised_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_unsupervised_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${unsupervised_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $src_dir/best_path_${unsupervised_set}/weights.scp +fi + +deriv_weights_scp=$src_dir/best_path_${unsupervised_set}/weights.scp + +if [ $stage -le 14 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${src_dir}/best_path_${norvb_unsupervised_set} $dir +fi + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim 
recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +frame_subsampling_factor=3 + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 +lattice_lm_scale=0.5 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $(dirname $extractor)/ivectors_${supervised_set} \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set} + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_ts.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam 4.0 \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 
--lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "output-0=0,0 output-1=1,1" \ + --chain.mmi-factor-schedule "output-0=1,1 output-1=0,0" \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir $dir/egs_comb \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 21 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/decode.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +if [ $stage -le 22 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! 
-f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi + + +exit 0 + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_1a.sh new file mode 100755 index 00000000000..9436e56f0a2 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_1a.sh @@ -0,0 +1,290 @@ +#!/bin/bash + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +student_rescore_lang=data/lang_pp_test_fg + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +tdnn_affix=_1a +chain_affix=_kl_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
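+  # (For example, with the xent_regularize=0.025 set at the top of this
+  # script, learning_rate_factor works out to 0.5 / 0.025 = 20, i.e. the xent
+  # output layer learns 20x faster to compensate for the small objective scale.)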
+ output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0 --deriv-weights-scp $deriv_weights_scp" + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train_ts.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_1b.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_1b.sh new file mode 100755 index 00000000000..37977e56ba7 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_1b.sh @@ -0,0 +1,292 @@ +#!/bin/bash + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +student_rescore_lang=data/lang_pp_test_fg + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +tdnn_affix=_1b +chain_affix=_kl_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +lattice_lm_scale=0.5 + +remove_egs=false +common_egs_dir= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +echo "use the other script" +exit 1 + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +egs_opts="--lattice-lm-scale $lattice_lm_scale --lattice-prune-beam 4.0 --deriv-weights-scp $deriv_weights_scp" + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train_ts.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
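+  # (To spell that out: mkgraph.sh composes HCLG from the lexicon and grammar
+  # in ${tgt_lang} together with the tree and transition model found in $dir,
+  # so only the phone set and vocabulary need to agree with the lang dir; the
+  # 'topo' is taken from the model itself.)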
+ utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_subset_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_subset_1a.sh new file mode 100755 index 00000000000..083b54a61e2 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_ts_ami_subset_1a.sh @@ -0,0 +1,420 @@ +#!/bin/bash + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +supervised_data_dir=data/ami_sdm1_train_reco12 +unsupervised_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +student_rescore_lang=data/lang_pp_test_fg + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +supervision_weights=1.0,1.0 +num_copies=1,1 +lm_weights=1,1 + +tdnn_affix=_1a +chain_affix=_semisup_ts_ami_subset_sdm1 +nnet3_affix=_semisup_ts_ami_subset_sdm1 + +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +lattice_lm_scale=0.5 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +frame_subsampling_factor=3 + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set}_sp + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $unsup_student_data_dir) \ + --generate-egs-scp true \ + $sup_student_data_dir $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg=150 +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set} + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_ts.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam 4.0 \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $unsup_student_data_dir) \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_sp_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 
--lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $unsup_student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $unsup_student_data_dir \ + --tree-dir $treedir \ + --lat-dir $unsup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 21 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_wt_ami_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_wt_ami_1a.sh new file mode 100755 index 00000000000..0004a29f188 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_kl_wt_ami_1a.sh @@ -0,0 +1,286 @@ +#!/bin/bash + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +student_rescore_lang=data/lang_pp_test_fg + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +tdnn_affix=_1a +chain_affix=_kl_wt_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +primary_lr_factor=0.25 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +# decode options +test_sets="ami_sdm1_dev ami_sdm1_eval" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig +# input dim=100 name=ivector +# input dim=40 name=input +# +# # please note that it is important to have input layer with the name=input +# # as the layer immediately preceding the fixed-affine-layer to enable +# # the use of short notation for the descriptor +# fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat +# +# # the first splicing is moved before the lda layer, so no splicing here +# relu-batchnorm-layer name=tdnn1 dim=$hidden_dim +# relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim +# +# fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# +# ## adding the layers for chain branch +# output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 +# +# # adding the layers for xent 
branch +# # This block prints the configs for a separate output that will be +# # trained with a cross-entropy objective in the 'chain' models... this +# # has the effect of regularizing the hidden parts of the model. we use +# # 0.5 / args.xent_regularize as the learning rate factor- the factor of +# # 0.5 / args.xent_regularize is suitable as it means the xent +# # final-layer learns at a rate independent of the regularization +# # constant; and the 0.5 was tuned so as to make the relative progress +# # similar in the xent and regular final layers. +# output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +# +# output name=output-0 input=output.affine@$label_delay skip-in-init=true +# output name=output-1 input=output.affine@$label_delay skip-in-init=true +# +# output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +# output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +#EOF +# steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +#fi + +if [ $stage -le 12 ]; then + # Set the learning-rate-factor for all transferred layers but the last output + # layer to primary_lr_factor. + $train_cmd $dir/log/generate_input_mdl.log \ + nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ + $src_dir/final.mdl $dir/input.raw || exit 1; +fi + +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0 --deriv-weights-scp $deriv_weights_scp" + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train_ts.py --stage $train_stage \ + --cmd "$decode_cmd" --trainer.input-model $dir/input.raw \ + --feat.online-ivector-dir $teacher_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $teacher_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
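+  # (Since HCLG depends only on ${tgt_lang} and on the tree/topology in $dir,
+  # the graph built here should come out the same across the tuning scripts
+  # that share exp/semisup300k/chain/tree_bi_b; it could in principle be built
+  # once and reused.)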
+ utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $(dirname $src_ivector_extractor)/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_norvb_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_norvb_1a.sh new file mode 100755 index 00000000000..527fe36bb37 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_norvb_1a.sh @@ -0,0 +1,288 @@ +#!/bin/bash + +set -e + +# based on run_tdnn_7b.sh in the swbd recipe + +# configs for 'chain' +affix=v8 + +stage=0 +train_stage=-10 +get_egs_stage=-10 +test_stage=1 +nj=70 + +train_set=train_300k +exp=exp/semisup300k +gmm=tri5a + +tdnn_affix=_1a +tree_affix=bi_a +nnet3_affix=_norvb +chain_affix=_norvb + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=4 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/uttlist + + utils/subset_data_dir.sh --utt-list $dir/uttlist data/train_sp_hires \ + data/${train_set}_sp_hires + + utils/subset_data_dir.sh --utt-list $dir/uttlist data/train_sp \ + data/${train_set}_sp +fi + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 30 --cmd "$train_cmd" \ + --generate-ali-from-lats true data/${train_set}_sp \ + data/lang $gmm_dir $lat_dir || exit 1; + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 10 ]; then + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + rm -rf $lang + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
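+  # (The 7000 below is the target number of tree leaves; the actual pdf count
+  # of the built tree can be checked with "tree-info $treedir/tree | grep num-pdfs",
+  # which is how num_targets is obtained in the xconfig stage further down.)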
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev test; do + ( + if [ ! -f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi + +exit 0 + +if [ $stage -le 16 ]; then + local/nnet3/prep_test_aspire.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +if [ $stage -le 17 ]; then +# #Online decoding example + + local/nnet3/prep_test_aspire_online.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 --frames-per-chunk 160 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --max-count 75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + + + + +exit 0; + diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1a.sh new file mode 100755 index 00000000000..9bdd1c99c65 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1a.sh @@ -0,0 +1,370 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +test_stage=0 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +tdnn_affix=_1a +chain_affix=_semisup_ts + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $lat_dir/uttlist.$n.$nj + #done + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${train_set}/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $lat_dir/lat.JOB.gz" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_lat_dir/$f ]; then cp $norvb_lat_dir/$f $lat_dir/$f; fi + done +fi + +ln -sf ../final.mdl $lat_dir/final.mdl + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} $decode_lang ${norvb_lat_dir} \ + $src_dir/best_path_${norvb_train_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_train_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${train_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $src_dir/best_path_${train_set}/weights.scp +fi + +egs_opts="$egs_opts --deriv-weights-scp $src_dir/best_path_${train_set}/weights.scp" + +if [ $stage -le 14 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim 
recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
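+  # The lattices in $lat_dir come from decoding with the seed chain model (see
+  # the lattice-copy stage above), so they appear to already be at the
+  # subsampled output frame rate; hence --chain.alignment-subsampling-factor 1
+  # and the tight left/right tolerances of 1 passed below.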
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 17 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/decode.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +if [ $stage -le 22 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! 
-f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi + +exit 0 + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1b.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1b.sh new file mode 100755 index 00000000000..d2affd9372d --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1b.sh @@ -0,0 +1,460 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +test_stage=0 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +extractor=exp/nnet3/extractor + +sup_lat_dir=exp/semisup300k/chain/tri5b_train_300k_rvb_sp_lats +supervised_set=train_300k_rvb_sp + +norvb_unsupervised_set=train +unsupervised_set=train_rvb + +tdnn_affix=_1b +chain_affix=_semisup_ts + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +lm_weights=1,3 # 3 - To compensate for using alignments before reverberation. +supervision_weights=1.0,1.0 +num_copies=2,1 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $unsup_lat_dir/uttlist.$n.$nj + #done + + rm -f $unsup_lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_unsup_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $unsup_lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_unsup_lat_dir/lat.JOB.gz |" \ + ark,scp:$unsup_lat_dir/lat_tmp.JOB.ark,$unsup_lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $unsup_lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $unsup_lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $unsup_lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${unsupervised_set}_hires/split$nj/JOB/utt2spk $unsup_lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $unsup_lat_dir/lat.JOB.gz" || exit 1 + + rm $unsup_lat_dir/lat_tmp.* $unsup_lat_dir/lat_rvb.scp + + echo $nj > $unsup_lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_unsup_lat_dir/$f ]; then cp $norvb_unsup_lat_dir/$f $unsup_lat_dir/$f; fi + done +fi + +ln -sf ../final.mdl $unsup_lat_dir/final.mdl + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} $decode_lang ${norvb_unsup_lat_dir} \ + $src_dir/best_path_${norvb_unsupervised_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_unsupervised_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${unsupervised_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $src_dir/best_path_${unsupervised_set}/weights.scp +fi + +deriv_weights_scp=$src_dir/best_path_${unsupervised_set}/weights.scp + +if [ $stage -le 14 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${src_dir}/best_path_${norvb_unsupervised_set} $dir +fi + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim 
recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +frame_subsampling_factor=3 + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $(dirname $extractor)/ivectors_${supervised_set} \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set} + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale 0.5 \ + --lattice-prune-beam 4.0 \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 
--lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir $dir/egs_comb \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 21 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/decode.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +if [ $stage -le 22 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! 
-f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi + +exit 0 + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1c.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1c.sh new file mode 100755 index 00000000000..1893fa7772f --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1c.sh @@ -0,0 +1,469 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +test_stage=0 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +extractor=exp/nnet3/extractor + +sup_lat_dir=exp/semisup300k/chain/tri5b_train_300k_rvb_sp_lats +supervised_set=train_300k_rvb_sp + +norvb_unsupervised_set=train +unsupervised_set=train_rvb + +tdnn_affix=_1c +chain_affix=_semisup_ts + +kl_factor_schedule="output-0=0,0 output-1=0,0" +mmi_factor_schedule="output-0=1,1 output-1=1,1" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +lm_weights=1,3 # 3 - To compensate for using alignments before reverberation. +supervision_weights=1.0,1.0 +num_copies=2,1 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $unsup_lat_dir/uttlist.$n.$nj + #done + + rm -f $unsup_lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_unsup_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $unsup_lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_unsup_lat_dir/lat.JOB.gz |" \ + ark,scp:$unsup_lat_dir/lat_tmp.JOB.ark,$unsup_lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $unsup_lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $unsup_lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $unsup_lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${unsupervised_set}_hires/split$nj/JOB/utt2spk $unsup_lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $unsup_lat_dir/lat.JOB.gz" || exit 1 + + rm $unsup_lat_dir/lat_tmp.* $unsup_lat_dir/lat_rvb.scp + + echo $nj > $unsup_lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_unsup_lat_dir/$f ]; then cp $norvb_unsup_lat_dir/$f $unsup_lat_dir/$f; fi + done + + ln -sf ../final.mdl $unsup_lat_dir/final.mdl +fi + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} ${norvb_unsup_lat_dir} \ + $src_dir/best_path_${norvb_unsupervised_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_unsupervised_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${unsupervised_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $src_dir/best_path_${unsupervised_set}/weights.scp +fi + +deriv_weights_scp=$src_dir/best_path_${unsupervised_set}/weights.scp + +if [ $stage -le 14 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${src_dir}/best_path_${norvb_unsupervised_set} $dir +fi + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim 
recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +frame_subsampling_factor=3 + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $(dirname $extractor)/ivectors_${supervised_set} \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +lattice_lm_scale=0.5 +kl_fst_scale=0.5 + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set} + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam 4.0 --kl-latdir $unsup_lat_dir --kl-fst-scale $kl_fst_scale \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + 
--chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir $dir/egs_comb \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 21 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! -f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi + + exit 1 +fi + +if [ $stage -le 22 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/decode.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi +exit 0 + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 
2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1d.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1d.sh new file mode 100755 index 00000000000..ebe52330554 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_1d.sh @@ -0,0 +1,484 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +test_stage=0 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +extractor=exp/nnet3/extractor + +sup_lat_dir=exp/semisup300k/chain/tri5b_train_300k_rvb_sp_lats +supervised_set=train_300k_rvb_sp + +norvb_unsupervised_set=train +unsupervised_set=train_rvb + +tdnn_affix=_1c +chain_affix=_semisup_ts + +kl_factor_schedule="output-0=0,0 output-1=0,0" +mmi_factor_schedule="output-0=1,1 output-1=1,1" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +lm_weights=1,3 # 3 - To compensate for using alignments before reverberation. +supervision_weights=1.0,1.0 +num_copies=2,1 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $src_dir/best_path_${unsupervised_set}/weights.scp +fi + +deriv_weights_scp=$src_dir/best_path_${unsupervised_set}/weights.scp + +kl_decode_graph_dir=$src_dir/graph${kl_decode_graph_affix} + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
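+  # (Illustrative note: --self-loop-scale 1.0 is the setting used when building
+  # decoding graphs for 'chain' models, rather than the smaller default that
+  # mkgraph.sh would otherwise apply for conventional systems.)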
+ utils/mkgraph.sh --self-loop-scale 1.0 ${kl_decode_lang} $src_dir $kl_decode_graph_dir +fi + +if [ $stage -le 15 ]; then + steps/nnet3/decode_semisup.sh --sub-split $nj --nj $nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --write-compact true --word-determinize false \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $unsup_src_ivector_dir \ + --skip-scoring true \ + $kl_decode_graph_dir data/${norvb_unsupervised_set}_hires $norvb_unsup_kl_lat_dir || exit 1 +fi + +if [ $stage -le 16 ]; then + utt_prefixes= + for n in $(seq $num_data_reps); do + utt_prefixes="$utt_prefixes rev${n}_" + done + + local/semisup/copy_lat_dir.sh --write-compact true \ + --nj $nj --utt_prefixes "$utt_prefixes" \ + data/${unsupervised_set}_hires \ + ${norvb_unsup_kl_lat_dir} ${unsup_kl_lat_dir} + + ln -sf ../final.mdl $unsup_kl_lat_dir/final.mdl +fi + +if [ $stage -le 17 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${src_dir}/best_path_${norvb_unsupervised_set} $dir +fi + +if [ $stage -le 18 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + + # adding the layers for xent branch + # This 
block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +frame_subsampling_factor=3 + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 19 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $(dirname $extractor)/ivectors_${supervised_set} \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +lattice_lm_scale=0.5 +kl_fst_scale=0.5 + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set} + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 20 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam 4.0 --kl-latdir $unsup_kl_lat_dir --kl-fst-scale $kl_fst_scale \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 21 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 22 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir $dir/egs_comb \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 23 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 24 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! 
-f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi + + exit 1 +fi + +if [ $stage -le 25 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/decode.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi +exit 0 + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1a.sh new file mode 100755 index 00000000000..3bb40618983 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1a.sh @@ -0,0 +1,292 @@ +#!/bin/bash + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +student_rescore_lang=data/lang_pp_test_fg + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +tdnn_affix=_1a +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
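+  # For example, with the value xent_regularize=0.025 configured near the top
+  # of this script, that factor works out to 0.5 / 0.025 = 20; the variable
+  # learning_rate_factor used below is assumed to be computed earlier in the
+  # script as 0.5 divided by xent_regularize.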
+ output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0 --deriv-weights-scp $deriv_weights_scp" + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1b.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1b.sh new file mode 100755 index 00000000000..97bf6950664 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1b.sh @@ -0,0 +1,292 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script is similar to _a, but uses one less group of +# TDNN + LSTM layer + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +tdnn_affix=_1b +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output=0.0,0.0" +mmi_factor_schedule="output=1.0,1.0" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a 
separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model. we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+egs_opts="$egs_opts --deriv-weights-scp $deriv_weights_scp"
+
+if [ $stage -le 13 ]; then
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$decode_cmd" \
+    --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.00005 \
+    --chain.apply-deriv-weights true \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --trainer.dropout-schedule $dropout_schedule \
+    --trainer.num-chunk-per-minibatch 64,32 \
+    --trainer.frames-per-iter 1500000 \
+    --trainer.max-param-change 2.0 \
+    --trainer.num-epochs $num_epochs \
+    --trainer.optimization.shrink-value 0.99 \
+    --trainer.optimization.num-jobs-initial 3 \
+    --trainer.optimization.num-jobs-final 16 \
+    --trainer.optimization.initial-effective-lrate 0.001 \
+    --trainer.optimization.final-effective-lrate 0.0001 \
+    --trainer.optimization.momentum 0.0 \
+    --trainer.deriv-truncate-margin 8 \
+    --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \
+    --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \
+    --chain.right-tolerance 1 --chain.left-tolerance 1 \
+    --chain.alignment-subsampling-factor 1 \
+    --egs.chunk-width 160,140,110,80 \
+    --egs.chunk-left-context $chunk_left_context \
+    --egs.chunk-right-context $chunk_right_context \
+    --egs.chunk-left-context-initial 0 \
+    --egs.chunk-right-context-final 0 \
+    --egs.dir "$common_egs_dir" \
+    --cleanup.remove-egs $remove_egs \
+    --feat-dir $student_data_dir \
+    --tree-dir $treedir \
+    --lat-dir $lat_dir \
+    --dir $dir $train_opts || exit 1;
+fi
+
+graph_dir=$dir/graph${tgt_graph_affix}
+if [ $stage -le 14 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
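+  # For context (standard 'chain' settings): the graph is built with
+  # --self-loop-scale 1.0 because chain models are trained and decoded with an
+  # acoustic scale of 1.0, and the decoding stage below passes
+  # --acwt 1.0 --post-decode-acwt 10.0 so that lattice scores are scaled up by
+  # 10 after decoding and the usual language-model-weight range still works
+  # with the scoring scripts.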
+ utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1c.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1c.sh new file mode 100755 index 00000000000..164fa929052 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1c.sh @@ -0,0 +1,333 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script is similar to _b, but supports using different lattices +# for KL training, usually generated using a unigram LM. +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +kl_student_graph_affix=_pp +kl_student_lang=data/lang_pp_test + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +lm_weights=1,0 # src, tgt weight + +tdnn_affix=_1b +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output=0.0,0.0" +mmi_factor_schedule="output=1.0,1.0" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +kl_fst_scale=0.5 +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
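+  # Note: both the chain and xent outputs below take their input from lstm3;
+  # compared with the _1a setup, this network drops the last TDNN + LSTM group
+  # (tdnn8, tdnn9 and lstm4), as described in the header of the _1b script.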
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +egs_opts="$egs_opts --deriv-weights-scp $deriv_weights_scp --kl-latdir $kl_lat_dir --kl-fst-scale $kl_fst_scale" + +if [ $stage -le 15 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights \ + --cmd "$train_cmd" \ + $treedir $src_dir/best_path${student_graph_affix}_${tgt_dataset}_sp \ + $dir +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 16 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd --mem 4G" --combine-queue-opt "--h-rt 00:59:00" --train-queue-opt "--h-rt 00:15:00" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 17 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1d.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1d.sh new file mode 100755 index 00000000000..e59e26c1fb6 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1d.sh @@ -0,0 +1,331 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script supports using different lattices +# for KL training, usually generated using a unigram LM. +# This script is similar to _c, but updates existing teacher model instead +# of training from scratch. + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +kl_student_graph_affix=_pp +kl_student_lang=data/lang_pp_test + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +tdnn_affix=_1c +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output=0.0,0.0" +mmi_factor_schedule="output=1.0,1.0" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +kl_fst_scale=0.5 +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +primary_lr_factor=0.1 + +if [ $stage -le 14 ]; then + $train_cmd $dir/log/generate_input_mdl.log \ + nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ + $src_dir/final.mdl $dir/input.raw || exit 1; +fi + +egs_opts="$egs_opts --deriv-weights-scp $deriv_weights_scp --kl-latdir $kl_lat_dir --kl-fst-scale $kl_fst_scale" + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" --trainer.input-model $dir/input.raw \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 17 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1e.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1e.sh new file mode 100755 index 00000000000..5a3b7a3f0e8 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1e.sh @@ -0,0 +1,328 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script is similar to _d, but uses phone LM graph to compute +# numerator posteriors for KL objective. This script does weights +# transfer to update the nnet to target domain. + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +lm_weights=1,0 # src, tgt weight + +tdnn_affix=_1e +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output=0.0,0.0" +mmi_factor_schedule="output=1.0,1.0" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +primary_lr_factor=0.1 + +if [ $stage -le 14 ]; then + $train_cmd $dir/log/generate_input_mdl.log \ + nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ + $src_dir/final.mdl $dir/input.raw || exit 1; +fi + +egs_opts="$egs_opts --deriv-weights-scp $deriv_weights_scp --graph-posterior-rspecifier scp:$graph_post_dir/numerator_post.scp" + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" --trainer.input-model $dir/input.raw \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 17 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1f.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1f.sh new file mode 100755 index 00000000000..e29f20d3b46 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1f.sh @@ -0,0 +1,323 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script uses phone LM graph to compute numerator posteriors for +# KL objective. +# This script is same as _e, but trains neural network from scratch. + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +lm_weights=1,0 # src, tgt weight + +tdnn_affix=_1f +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output=0.0,0.0" +mmi_factor_schedule="output=1.0,1.0" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +egs_opts="$egs_opts --deriv-weights-scp $deriv_weights_scp --graph-posterior-rspecifier scp:$graph_post_dir/numerator_post.scp" + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 17 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1g.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1g.sh new file mode 100755 index 00000000000..4f862f62a2d --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1g.sh @@ -0,0 +1,323 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script uses phone LM graph to compute numerator posteriors for +# KL objective. +# This script is same as _e, but trains neural network from scratch. + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +lm_weights=1,0 # src, tgt weight + +tdnn_affix=_1g +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output=0.0,0.0" +mmi_factor_schedule="output=1.0,1.0" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +egs_opts="$egs_opts --deriv-weights-scp $deriv_weights_scp --graph-posterior-rspecifier scp:$graph_post_dir/numerator_post.scp" + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 17 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1h.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1h.sh new file mode 100755 index 00000000000..0ab38f8b2f3 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1h.sh @@ -0,0 +1,307 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script is similar to _b, but supports generates numerator posteriors +# after splitting egs. +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +lm_weights=1,0 # src, tgt weight + +tdnn_affix=_1h +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output=0.0,0.0" +mmi_factor_schedule="output=1.0,1.0" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +kl_fst_scale=0.5 +egs_opts="--lattice-lm-scale 0.5 --lattice-prune-beam 4.0" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +egs_opts="$egs_opts --deriv-weights-scp $deriv_weights_scp --add-numerator-post true" + +if [ $stage -le 15 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights \ + --cmd "$train_cmd" \ + $treedir $src_dir/best_path${student_graph_affix}_${tgt_dataset}_sp \ + $dir +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 16 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 17 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1i.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1i.sh new file mode 100755 index 00000000000..8f1ad853201 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1i.sh @@ -0,0 +1,420 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script is similar to _b, but supports generates numerator posteriors +# after splitting egs. +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test + +sup_lat_dir=exp/semisup300k/chain/tri5b_train_300k_rvb_sp_lats +supervised_set=train_300k_rvb_sp +supervision_weights=1,1 +num_copies=1,1 + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +lm_weights=1,3 # src, tgt weight + +tdnn_affix=_1h +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output-0=0,0 output-1=0,0" +mmi_factor_schedule="output-0=1,1 output-1=1,1" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +sup_egs_dir= +unsup_egs_dir= +unsup_frames_per_eg=150 + +lattice_lm_scale=0.5 +kl_fst_scale=0.5 +unsup_egs_opts="" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +frame_subsampling_factor=1 +if [ -f $src_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=$(cat $src_dir/frame_subsampling_factor) || exit 1 +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}_${tgt_dataset}/ivectors_${supervised_set}_16kHz_${tgt_dataset}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_16kHz_hires $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_egs_opts="$unsup_egs_opts --deriv-weights-scp $deriv_weights_scp --add-numerator-post true" + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${tgt_dataset}_sp + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam 4.0 --kl-latdir $unsup_lat_dir --kl-fst-scale $kl_fst_scale \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}_${tgt_dataset}/ivectors_${supervised_set}_16kHz_${tgt_dataset}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + $student_data_dir $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd --mem 4G --h-rt 00:15:00" --combine-queue-opt "--h-rt 00:50:00" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}_${tgt_dataset}/ivectors_${supervised_set}_16kHz_${tgt_dataset}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $unsup_egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$dir/egs_comb" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
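+  # Concretely, utils/mkgraph.sh takes the tree and transition/chain model from
+  # $dir and only the lexicon and grammar FSTs from ${tgt_lang}, so the seed
+  # model's topology is what ends up in the graph; --self-loop-scale 1.0 is the
+  # usual choice for chain models, matching the acoustic scale of 1.0 used at
+  # decode time.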
+ utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 21 ]; then + rm -f $dir/.error + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}_${tgt_dataset}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1j.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1j.sh new file mode 100755 index 00000000000..428105469fa --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_1j.sh @@ -0,0 +1,421 @@ +#!/bin/bash + +# This script does MMI + KL training. +# This script is similar to _b, but supports generates numerator posteriors +# after splitting egs. +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test + +sup_lat_dir=exp/semisup300k/chain/tri5b_train_300k_rvb_sp_lats +supervised_set=train_300k_rvb_sp +supervision_weights=1,1 +num_copies=1,1 + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +lm_weights=1,3 # src, tgt weight + +tdnn_affix=_1j +chain_affix=_semisup_ts_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output-0=0,0 output-1=0,0" +mmi_factor_schedule="output-0=1,1 output-1=1,1" + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +sup_egs_dir= +unsup_egs_dir= +unsup_frames_per_eg=150 + +lattice_lm_scale=0.5 +kl_fst_scale=0.5 +unsup_egs_opts="" +train_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +frame_subsampling_factor=1 +if [ -f $src_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=$(cat $src_dir/frame_subsampling_factor) || exit 1 +fi + +. 
$dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}_${tgt_dataset}/ivectors_${supervised_set}_16kHz_${tgt_dataset}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_16kHz_hires $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_egs_opts="$unsup_egs_opts --deriv-weights-scp $deriv_weights_scp --add-numerator-post true" + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${tgt_dataset}_sp + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam 4.0 --kl-latdir $unsup_lat_dir --kl-fst-scale $kl_fst_scale \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}_${tgt_dataset}/ivectors_${supervised_set}_16kHz_${tgt_dataset}_sp_hires \ + --generate-egs-scp true $unsup_egs_opts \ + $student_data_dir $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd --mem 4G" --train-queue-opt "--h-rt 00:15:00" \ + --combine-queue-opt "--h-rt 00:50:00" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}_${tgt_dataset}/ivectors_${supervised_set}_16kHz_${tgt_dataset}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.kl-factor-schedule "$kl_factor_schedule" \ + --chain.mmi-factor-schedule "$mmi_factor_schedule" \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $unsup_egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$dir/egs_comb" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $student_data_dir \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir $train_opts || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang 
directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 21 ]; then + rm -f $dir/.error + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}_${tgt_dataset}/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_subset_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_subset_1a.sh new file mode 100755 index 00000000000..131ea0c8ec9 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_ts_ami_subset_1a.sh @@ -0,0 +1,424 @@ +#!/bin/bash + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +supervised_data_dir=data/ami_sdm1_train_reco12 +unsupervised_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +student_rescore_lang=data/lang_pp_test_fg + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +supervision_weights=1.0,1.0 +num_copies=1,1 +lm_weights=1,1 + +tdnn_affix=_1a +chain_affix=_semisup_ts_ami_subset_sdm1 +nnet3_affix=_semisup_ts_ami_subset_sdm1 + +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +lattice_lm_scale=0.5 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +# decode options +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. 
$dir/configs/vars + +frame_subsampling_factor=3 + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set}_sp + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $unsup_student_data_dir) \ + --generate-egs-scp true \ + $sup_student_data_dir $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg= +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set} + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam 4.0 \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $unsup_student_data_dir) \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_sp_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 
--lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$(basename $unsup_student_data_dir) \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $unsup_student_data_dir \ + --tree-dir $treedir \ + --lat-dir $unsup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 21 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$dset \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_wt_1b.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_wt_1b.sh new file mode 100755 index 00000000000..ce2cf81516d --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_wt_1b.sh @@ -0,0 +1,430 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +test_stage=0 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +extractor=exp/nnet3/extractor + +sup_lat_dir=exp/semisup300k/chain/tri5b_train_300k_rvb_sp_lats +supervised_set=train_300k_rvb_sp + +norvb_unsupervised_set=train +unsupervised_set=train_rvb + +tdnn_affix=_1b +chain_affix=_semisup_wt + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=1 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +lm_weights=1,3 # 3 - To compensate for using alignments before reverberation. +supervision_weights=1.0,1.0 +num_copies=2,1 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $unsup_lat_dir/uttlist.$n.$nj + #done + + rm -f $unsup_lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_unsup_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $unsup_lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_unsup_lat_dir/lat.JOB.gz |" \ + ark,scp:$unsup_lat_dir/lat_tmp.JOB.ark,$unsup_lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $unsup_lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $unsup_lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $unsup_lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${unsupervised_set}_hires/split$nj/JOB/utt2spk $unsup_lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $unsup_lat_dir/lat.JOB.gz" || exit 1 + + rm $unsup_lat_dir/lat_tmp.* $unsup_lat_dir/lat_rvb.scp + + echo $nj > $unsup_lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_unsup_lat_dir/$f ]; then cp $norvb_unsup_lat_dir/$f $unsup_lat_dir/$f; fi + done +fi + +ln -sf ../final.mdl $unsup_lat_dir/final.mdl + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} $decode_lang ${norvb_unsup_lat_dir} \ + $src_dir/best_path_${norvb_unsupervised_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_unsupervised_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${unsupervised_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done 
| sort -k1,1 > $src_dir/best_path_${unsupervised_set}/weights.scp +fi + +deriv_weights_scp=$src_dir/best_path_${unsupervised_set}/weights.scp + +if [ $stage -le 14 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${src_dir}/best_path_${norvb_unsupervised_set} $dir +fi + +# if [ $stage -le 15 ]; then +# echo "$0: creating neural net configs using the xconfig parser"; +# +# num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') +# learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) +# +# lstm_opts="decay-time=40" +# +# mkdir -p $dir/configs +# cat < $dir/configs/network.xconfig +# input dim=100 name=ivector +# input dim=40 name=input +# +# # please note that it is important to have input layer with the name=input +# # as the layer immediately preceding the fixed-affine-layer to enable +# # the use of short notation for the descriptor +# fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat +# +# # the first splicing is moved before the lda layer, so no splicing here +# relu-batchnorm-layer name=tdnn1 dim=$hidden_dim +# relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim +# +# fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# +# ## adding the layers for chain branch +# output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 +# +# +# # adding the layers for xent branch +# # This block prints the configs for a separate output that will be +# # trained with a cross-entropy objective in the 'chain' models... this +# # has the effect of regularizing the hidden parts of the model. we use +# # 0.5 / args.xent_regularize as the learning rate factor- the factor of +# # 0.5 / args.xent_regularize is suitable as it means the xent +# # final-layer learns at a rate independent of the regularization +# # constant; and the 0.5 was tuned so as to make the relative progress +# # similar in the xent and regular final layers. 
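+# (Side note: the learning_rate_factor line above assumes python2-style "print";
+#  an equivalent sketch that avoids python, in the spirit of the perl one-liners
+#  used further below, would be
+#  learning_rate_factor=$(perl -e "print 0.5/$xent_regularize") .)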
+# output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +# +# output name=output-0 input=output.affine@$label_delay skip-in-init=true +# output name=output-1 input=output.affine@$label_delay skip-in-init=true +# +# output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +# output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +# EOF +# steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +# fi + +. $src_dir/configs/vars + +frame_subsampling_factor=3 + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $(dirname $extractor)/ivectors_${supervised_set} \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set} + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale 0.5 \ + --lattice-prune-beam 4.0 \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -1 ]; then + train_stage=-1 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" --trainer.input-model $src_dir/final.mdl \ + --feat.online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir $dir/egs_comb \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
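+  # A note on the decode options used further below: chain models are decoded
+  # with an acoustic scale of 1.0 (--acwt 1.0), and --post-decode-acwt 10.0
+  # scales the acoustic scores by 10 when the lattices are written out, so the
+  # usual language-model-scale ranges in the scoring scripts still apply.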
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 21 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/decode.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +exit 0 + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_wt_ami_1a.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_wt_ami_1a.sh new file mode 100755 index 00000000000..8779918f3c0 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_semisup_wt_ami_1a.sh @@ -0,0 +1,297 @@ +#!/bin/bash + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 + +# seed model params +src_dir=exp/semisup300k/chain/tdnn_lstm_1b_sp +treedir=exp/semisup300k/chain/tree_bi_b +src_ivector_extractor=exp/nnet3/extractor + +tgt_data_dir=data/ami_sdm1_train + +student_mfcc_config=conf/mfcc_hires.conf + +student_graph_affix=_pp +student_lang=data/lang_pp_test +student_rescore_lang=data/lang_pp_test_fg + +tgt_graph_affix=_ami +tgt_lang=data/lang_ami + +tdnn_affix=_1b +chain_affix=_semisup_wt_ami_sdm1 +nnet3_affix=_semisup_ts_ami_sdm1 + +kl_factor_schedule="output=0.5,0.5" +mmi_factor_schedule="output=0.5,0.5" + +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +primary_lr_factor=0.25 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +remove_egs=false +common_egs_dir= + +# decode options +test_sets="ami_sdm1_dev ami_sdm1_eval" + +scoring_script=local/score.sh + +extra_left_context=50 +extra_right_context=0 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig +# input dim=100 name=ivector +# input dim=40 name=input +# +# # please note that it is important to have input layer with the name=input +# # as the layer immediately preceding the fixed-affine-layer to enable +# # the use of short notation for the descriptor +# fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat +# +# # the first splicing is moved before the lda layer, so no splicing here +# relu-batchnorm-layer name=tdnn1 dim=$hidden_dim +# relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim +# +# fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim +# relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim +# fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts +# +# ## adding the layers for chain branch +# output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 +# +# # adding the layers for xent branch +# # This block prints the configs for a separate output that will be +# # trained with a cross-entropy objective in the 'chain' models... this +# # has the effect of regularizing the hidden parts of the model. we use +# # 0.5 / args.xent_regularize as the learning rate factor- the factor of +# # 0.5 / args.xent_regularize is suitable as it means the xent +# # final-layer learns at a rate independent of the regularization +# # constant; and the 0.5 was tuned so as to make the relative progress +# # similar in the xent and regular final layers. +# output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +# +# output name=output-0 input=output.affine@$label_delay skip-in-init=true +# output name=output-1 input=output.affine@$label_delay skip-in-init=true +# +# output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +# output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +#EOF +# steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +#fi + +if [ $stage -le 12 ]; then + # Set the learning-rate-factor for all transferred layers but the last output + # layer to primary_lr_factor. 
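+  # With primary_lr_factor=0.25 (set above), the transferred hidden layers are
+  # fine-tuned at a quarter of the normal learning rate; the --edits commands
+  # below are applied in order, so the second set-learning-rate-factor restores
+  # a factor of 1.0 for every component whose name matches output*.  The edited
+  # model is written to $dir/input.raw and passed to train.py in the next stage
+  # via --trainer.input-model.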
+ $train_cmd $dir/log/generate_input_mdl.log \ + nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \ + $src_dir/final.mdl $dir/input.raw || exit 1; +fi + +egs_opts="--kl-fst-scale 0.5 --lattice-lm-scale 0.5 --lattice-prune-beam 4.0 --deriv-weights-scp $deriv_weights_scp --add-numerator-post true" + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" --trainer.input-model $dir/input.raw \ + --feat.online-ivector-dir $teacher_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --chain.mmi-factor-schedule=$mmi_factor_schedule \ + --chain.kl-factor-schedule=$kl_factor_schedule \ + --egs.stage $get_egs_stage --egs.get-egs-script "steps/nnet3/chain/get_egs_split.sh" \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $teacher_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph${tgt_graph_affix} +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${tgt_lang} $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + for dset in $test_sets; do + ( + decode_dir=$dir/decode${tgt_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir $(dirname $src_ivector_extractor)/ivectors_${dset} \ + --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + + $scoring_script --cmd "$decode_cmd" \ + data/${dset}_hires $graph_dir $decode_dir + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi diff --git a/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_wgt_semisup_wt_1b.sh b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_wgt_semisup_wt_1b.sh new file mode 100755 index 00000000000..94e6b1b0f96 --- /dev/null +++ b/egs/aspire/s5/local/semisup/chain/tuning/run_tdnn_lstm_300k_wgt_semisup_wt_1b.sh @@ -0,0 +1,430 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=30 +test_stage=0 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +extractor=exp/nnet3/extractor + +sup_lat_dir=exp/semisup300k/chain/tri5b_train_300k_rvb_sp_lats +supervised_set=train_300k_rvb_sp + +norvb_unsupervised_set=train +unsupervised_set=train_rvb + +tdnn_affix=_1b +chain_affix=_semisup_ts + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +lm_weights=1,3 # 3 - To compensate for using alignments before reverberation. +supervision_weights=1.0,1.0 +num_copies=2,1 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $unsup_lat_dir/uttlist.$n.$nj + #done + + rm -f $unsup_lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_unsup_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $unsup_lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_unsup_lat_dir/lat.JOB.gz |" \ + ark,scp:$unsup_lat_dir/lat_tmp.JOB.ark,$unsup_lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $unsup_lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $unsup_lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $unsup_lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${unsupervised_set}_hires/split$nj/JOB/utt2spk $unsup_lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $unsup_lat_dir/lat.JOB.gz" || exit 1 + + rm $unsup_lat_dir/lat_tmp.* $unsup_lat_dir/lat_rvb.scp + + echo $nj > $unsup_lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_unsup_lat_dir/$f ]; then cp $norvb_unsup_lat_dir/$f $unsup_lat_dir/$f; fi + done +fi + +ln -sf ../final.mdl $unsup_lat_dir/final.mdl + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} $decode_lang ${norvb_unsup_lat_dir} \ + $src_dir/best_path_${norvb_unsupervised_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_unsupervised_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${unsupervised_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" 
"$2}' + done | sort -k1,1 > $src_dir/best_path_${unsupervised_set}/weights.scp +fi + +deriv_weights_scp=$src_dir/best_path_${unsupervised_set}/weights.scp + +if [ $stage -le 14 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${src_dir}/best_path_${norvb_unsupervised_set} $dir +fi + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay skip-in-init=true + output name=output-1 input=output.affine@$label_delay skip-in-init=true + + output name=output-0-xent input=output-xent.log-softmax@$label_delay skip-in-init=true + output name=output-1-xent input=output-xent.log-softmax@$label_delay skip-in-init=true +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +frame_subsampling_factor=3 + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +cmvn_opts=`cat $src_dir/cmvn_opts` || exit 1 + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $src_dir/egs/info/frames_per_eg) + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $(dirname $extractor)/ivectors_${supervised_set} \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir $sup_lat_dir $sup_egs_dir + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set} + [ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + + steps/nnet3/chain/get_egs_split.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance 1 --right-tolerance 1 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale 0.5 \ + --lattice-prune-beam 4.0 \ + --deriv-weights-scp $deriv_weights_scp \ + --online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + + touch $unsup_egs_dir/.nodelete + fi +fi + +if [ $stage -le 18 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 64 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $dir/egs_comb +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $(dirname $extractor)/ivectors_${unsupervised_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir $dir/egs_comb \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
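+  # (utils/mkgraph.sh takes the tree and final.mdl from $dir itself, so the
+  # graph is built with the chain model's own topology; the lang directory
+  # mainly supplies the lexicon and grammar, L.fst and G.fst.)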
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 21 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/decode.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +exit 0 + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/semisup/copy_lat_dir.sh b/egs/aspire/s5/local/semisup/copy_lat_dir.sh new file mode 100755 index 00000000000..6aefd24a0b8 --- /dev/null +++ b/egs/aspire/s5/local/semisup/copy_lat_dir.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +utt_prefixes= +max_jobs_run=30 +nj=100 +cmd=queue.pl +write_compact=true + +. ./path.sh +. utils/parse_options.sh + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + exit 1 +fi + +data=$1 +src_dir=$2 +dir=$3 + +mkdir -p $dir + +num_jobs=$(cat $src_dir/num_jobs) + +rm -f $dir/lat_tmp.*.{ark,scp} 2>/dev/null + +# Copy the lattices temporarily +$cmd --max-jobs-run $max_jobs_run JOB=1:$num_jobs $dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=$write_compact \ + "ark:gunzip -c $src_dir/lat.JOB.gz |" \ + ark,scp:$dir/lat_tmp.JOB.ark,$dir/lat_tmp.JOB.scp || exit 1 + +# Make copies of utterances for perturbed data +for p in $utt_prefixes; do + cat $dir/lat_tmp.*.scp | awk -v p=$p '{print p$0}' +done | sort -k1,1 > $dir/lat_out.scp + +utils/split_data.sh ${data} $nj + +# Copy and dump the lattices for perturbed data +$cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/copy_out_lattices.JOB.log \ + lattice-copy --write-compact=$write_compact \ + "scp:utils/filter_scp.pl ${data}/split$nj/JOB/utt2spk $dir/lat_out.scp |" \ + "ark:| gzip -c > $dir/lat.JOB.gz" || exit 1 + +rm $dir/lat_tmp.* $dir/lat_out.scp + +echo $nj > $dir/num_jobs + +for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $src_dir/$f ]; then cp $src_dir/$f $dir/$f; fi +done diff --git a/egs/aspire/s5/local/semisup/nnet3/run_student_ivector_common.sh b/egs/aspire/s5/local/semisup/nnet3/run_student_ivector_common.sh new file mode 100755 index 00000000000..b3cb3d1ec1a --- /dev/null +++ b/egs/aspire/s5/local/semisup/nnet3/run_student_ivector_common.sh @@ -0,0 +1,104 @@ +#!/bin/bash + +nnet3_affix= +stage=1 + +orig_data_dir=data/ami_sdm1_train_sp_hires +student_data_dir=data/ami_sdm1_train_16kHz_sp_hires +student_mfcc_config=conf/mfcc_hires_16kHz.conf + +test_sets="ami_sdm1_dev_16kHz ami_sdm1_eval_16kHz" + +num_threads_ubm=16 +nj=40 + +echo "$0 $@" # Print the command line for logging + +. ./path.sh +. ./cmd.sh + +set -e -o pipefail -u + +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + echo "Usage: $0" + exit 1 +fi + +if [ $stage -le 1 ]; then + if [ -f $student_data_dir/feats.scp ]; then + echo "$0: $student_data_dir/feats.scp exists. Remove it and skip this stage." 
+ exit 1 + fi + + utils/copy_data_dir.sh $orig_data_dir $student_data_dir + + steps/make_mfcc.sh --mfcc-config $student_mfcc_config --cmd "$train_cmd" --nj $nj \ + $student_data_dir + steps/compute_cmvn_stats.sh $student_data_dir + utils/fix_data_dir.sh $student_data_dir +fi + +if [ $stage -le 2 ]; then + for dset in $test_sets; do + utils/copy_data_dir.sh data/${dset} data/${dset}_hires + steps/make_mfcc.sh --mfcc-config $student_mfcc_config --cmd "$train_cmd" --nj $nj \ + data/${dset}_hires + steps/compute_cmvn_stats.sh data/${dset}_hires + utils/fix_data_dir.sh data/${dset}_hires + done +fi + +if [ $stage -le 3 ]; then + steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + --max-utts 30000 --subsample 2 \ + $student_data_dir exp/nnet3${nnet3_affix}/pca_transform +fi + +if [ $stage -le 4 ]; then + num_utts=$(cat $student_data_dir/utt2spk | wc -l) + suffix= + if [ $num_utts -gt 30000 ]; then + utils/subset_data_dir.sh $student_data_dir 30000 ${student_data_dir}_30k + suffix=_30k + fi + + # To train a diagonal UBM we don't need very much data, so use the smallest + # subset. + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj $nj \ + --num-frames 400000 --num-threads $num_threads_ubm \ + ${student_data_dir}${suffix} 512 exp/nnet3${nnet3_affix}/pca_transform \ + exp/nnet3${nnet3_affix}/diag_ubm +fi + +if [ $stage -le 5 ]; then + num_utts=$(cat $student_data_dir/utt2spk | wc -l) + suffix= + if [ $num_utts -gt 100000 ]; then + utils/subset_data_dir.sh $student_data_dir 100000 ${student_data_dir}_100k + suffix=_100k + fi + # iVector extractors can in general be sensitive to the amount of data, but + # this one has a fairly small dim (defaults to 100) so we don't use all of it, + # we use just the 100k subset (about one sixteenth of the data). 
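+  # (The extractor keeps the default i-vector dimension of 100, which is what
+  # the chain network configs expect through their 'input dim=100 name=ivector'
+  # line.)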
+ steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj $nj \ + ${student_data_dir}${suffix} exp/nnet3${nnet3_affix}/diag_ubm \ + exp/nnet3${nnet3_affix}/extractor || exit 1; +fi + +if [ $stage -le 6 ]; then + utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \ + $student_data_dir ${student_data_dir}_max2 + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ + ${student_data_dir}_max2 exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_$(basename $student_data_dir) + + for dset in $test_sets; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ + data/${dset}_hires exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_$dset + done +fi diff --git a/egs/aspire/s5/local/semisup/run_300k.sh b/egs/aspire/s5/local/semisup/run_300k.sh new file mode 100644 index 00000000000..08f392bd8f1 --- /dev/null +++ b/egs/aspire/s5/local/semisup/run_300k.sh @@ -0,0 +1,18 @@ +false && { +local/fisher_train_lms_pocolm.sh --text data/train_300k_dev/text --lexicon data/local/dict/lexicon.txt --dir data/local/pocolm_300k --num-ngrams-large 250000 + +local/fisher_create_test_lang.sh --arpa-lm data/local/pocolm_300k/data/arpa/4gram_big.arpa.gz --lang data/lang_300k_pp --dir data/lang_300k_pp_test + +local/semisup/build_silprob.sh +} + +mkdir -p data/lang_300k_pp_ug_test + +oov=`cat data/lang_300k_pp/oov.int` || exit 1; +cp -rT data/lang_300k_pp data/lang_300k_pp_ug_test + +cat data/train_300k_dev/text | utils/sym2int.pl --map-oov $oov -f 2- data/lang_300k_pp/words.txt | \ + awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \ + utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > data/lang_300k_pp_ug_test/G.fst \ + || exit 1; + From 9c87d0d3f60e3cc505424f2127361783e89324f6 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 28 Jun 2018 15:02:45 -0400 Subject: [PATCH 166/174] aspire: Adding aspire clean and ts recipes --- .../s5/local/chain/tuning/run_tdnn_lstm_1b.sh | 289 ++++++++++++++ .../chain/tuning/run_tdnn_lstm_kl_ts_1a.sh | 369 ++++++++++++++++++ .../chain/tuning/run_tdnn_lstm_kl_ts_1b.sh | 334 ++++++++++++++++ .../chain/tuning/run_tdnn_lstm_norvb_1a.sh | 247 ++++++++++++ .../tuning/run_tdnn_lstm_semisup_ts_1a.sh | 367 +++++++++++++++++ .../s5/local/fisher_train_lms_pocolm.sh | 186 +++++++++ 6 files changed, 1792 insertions(+) create mode 100755 egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1b.sh create mode 100644 egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_kl_ts_1a.sh create mode 100644 egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_kl_ts_1b.sh create mode 100755 egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_norvb_1a.sh create mode 100755 egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_semisup_ts_1a.sh create mode 100755 egs/aspire/s5/local/fisher_train_lms_pocolm.sh diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1b.sh new file mode 100755 index 00000000000..0192e0ce1fe --- /dev/null +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -0,0 +1,289 @@ +#!/bin/bash + +set -e + +# based on run_tdnn_7b.sh in the swbd recipe + +# configs for 'chain' +affix=v8 + +stage=0 +train_stage=-10 +get_egs_stage=-10 + +tdnn_affix=1a +tree_affix=bi_a +chain_affix= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 
+label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $lat_dir/uttlist.$n.$nj + done + + norvb_lat_dir=exp/chain${chain_affix}/tri5a_train_lats + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd JOB=1:$norvb_nj $lat_dir/JOB/copy_lattices.JOB.log \ + lattice-copy "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk '{print "rev"n"_"$1" "$2}' + done > $lat_dir/lat_rvb.scp + + $train_cmd JOB=1:$nj $lat_dir/JOB/copy_rvb_lattices.JOB.log \ + "scp:utils/filter_scp.pl data/${train_set}/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:gzip -c > $lat_dir/lat.JOB.gz |" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree; do + cp $norvb_lat_dir/$f $lat_dir/$f + done +fi + +if [ $stage -le 10 ]; then + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + rm -rf $lang + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ # we build the tree using clean features (data/train) rather than + # the augmented features (data/train_rvb) to get better alignments + + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/train $lang exp/tri5a $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
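+  # For example, with xent_regularize=0.025 as set above, learning_rate_factor
+  # comes out to 0.5 / 0.025 = 20, i.e. the xent output layer trains 20 times
+  # faster to offset the small weight on the cross-entropy objective.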
+ output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 14 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/prep_test_aspire.sh --stage 1 --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire data/lang $dir/graph_pp $dir +fi + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_kl_ts_1a.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_kl_ts_1a.sh new file mode 100644 index 00000000000..590ae64fd97 --- /dev/null +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_kl_ts_1a.sh @@ -0,0 +1,369 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=50 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +tdnn_affix=_1a +chain_affix=_semisup_kl + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +lattice_lm_scale=0.5 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $lat_dir/uttlist.$n.$nj + #done + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${train_set}/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $lat_dir/lat.JOB.gz" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_lat_dir/$f ]; then cp $norvb_lat_dir/$f $lat_dir/$f; fi + done +fi + +ln -sf ../final.mdl $lat_dir/final.mdl + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} $decode_lang ${norvb_lat_dir} \ + $src_dir/best_path_${norvb_train_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_train_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${train_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $src_dir/best_path_${train_set}/weights.scp +fi + +egs_opts="$egs_opts --deriv-weights-scp $src_dir/best_path_${train_set}/weights.scp" + +if [ $stage -le 14 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim 
recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_ts.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 17 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/decode.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 --iter "$decode_iter" \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +if [ $stage -le 22 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev_rvb test_rvb; do + ( + if [ ! -f exp/nnet3/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3/extractor \ + exp/nnet3/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp${decode_iter:+_iter$decode_iter} + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 ${decode_iter:+--iter $decode_iter} \ + --online-ivector-dir exp/nnet3/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi + + +exit 0 + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_kl_ts_1b.sh 
b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_kl_ts_1b.sh new file mode 100644 index 00000000000..3c5042df975 --- /dev/null +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_kl_ts_1b.sh @@ -0,0 +1,334 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=50 + +exp=exp/semisup300k + +# seed model params +src_dir=exp/semisup300k/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/semisup300k/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +tdnn_affix=_1a +chain_affix=_semisup_kl + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $lat_dir/uttlist.$n.$nj + #done + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${train_set}/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $lat_dir/lat.JOB.gz" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_lat_dir/$f ]; then cp $norvb_lat_dir/$f $lat_dir/$f; fi + done +fi + +ln -sf ../final.mdl $lat_dir/final.mdl + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} $decode_lang ${norvb_lat_dir} \ + $src_dir/best_path_${norvb_train_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_train_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${train_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $src_dir/best_path_${train_set}/weights.scp +fi + +egs_opts="$egs_opts --deriv-weights-scp $src_dir/best_path_${train_set}/weights.scp" + +if [ $stage -le 14 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding 
the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_ts.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 17 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/prep_test_aspire.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_norvb_1a.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_norvb_1a.sh new file mode 100755 index 00000000000..94ba563ae0f --- /dev/null +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_norvb_1a.sh @@ -0,0 +1,247 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe. + +# configs for 'chain' +affix=1a + +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=200 + +tdnn_affix=1a +tree_affix=bi_a +nnet3_affix=_norvb +chain_affix=_norvb + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev test; do + ( + if [ ! -f exp/nnet3${nnet3_affix}/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi +fi +exit 0 + +#if [ $stage -le 16 ]; then +# local/nnet3/prep_test_aspire.sh --stage 1 --decode-num-jobs 30 --affix "$affix" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp $dir +#fi +#exit 0; diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_semisup_ts_1a.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_semisup_ts_1a.sh new file mode 100755 index 00000000000..2994335f71b --- /dev/null +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_semisup_ts_1a.sh @@ -0,0 +1,367 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 +max_jobs_run=50 + +# seed model params +src_dir=exp/chain_norvb/tdnn_lstm_1a_sp +treedir=exp/chain_norvb/tree_bi_a +src_ivector_extractor=exp/nnet3_norvb/extractor + +use_transcripts=false +tdnn_affix=_1a +chain_affix=_semisup_ts + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $lat_dir/uttlist.$n.$nj + #done + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + # Copy the lattices temporarily + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$norvb_nj $lat_dir/log/copy_lattices.JOB.log \ + lattice-copy --write-compact=false "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + # Make copies of utterances for perturbed data + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $lat_dir/lat_rvb.scp + + # Copy and dump the lattices for perturbed data + $train_cmd --max-jobs-run $max_jobs_run JOB=1:$nj $lat_dir/log/copy_rvb_lattices.JOB.log \ + lattice-copy --write-compact=false \ + "scp:utils/filter_scp.pl data/${train_set}/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:| gzip -c > $lat_dir/lat.JOB.gz" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_lat_dir/$f ]; then cp $norvb_lat_dir/$f $lat_dir/$f; fi + done +fi + +ln -sf ../final.mdl $lat_dir/final.mdl + +if [ $stage -le 12 ]; then + steps/best_path_weights.sh --cmd "$decode_cmd" \ + ${norvb_train_data_dir} $decode_lang ${norvb_lat_dir} \ + $src_dir/best_path_${norvb_train_set} +fi + +if [ $stage -le 13 ]; then + norvb_weights_dir=$src_dir/best_path_${norvb_train_set} + norvb_nj=$(cat $norvb_weights_dir/num_jobs) + + mkdir -p $src_dir/best_path_${train_set} + for n in `seq 3`; do + cat $norvb_weights_dir/weights.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + done | sort -k1,1 > $src_dir/best_path_${train_set}/weights.scp +fi + +egs_opts="$egs_opts --deriv-weights-scp $src_dir/best_path_${train_set}/weights.scp" + +if [ $stage -le 14 ]; then + echo "$0: creating 
neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
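The per-frame derivative weights passed to the trainer through --deriv-weights-scp above are just the clean-data best-path weights re-keyed for each of the num_data_reps=3 reverberated copies, so every "revN_" utterance reuses the weights of its clean original. A self-contained toy sketch of that re-keying (the utterance id and ark offset below are made up):

  printf 'fsh_utt1 weights.1.ark:42\n' > toy_weights.scp   # hypothetical scp entry
  for n in `seq 3`; do
    awk -v n=$n '{print "rev"n"_"$1" "$2}' toy_weights.scp
  done | sort -k1,1
  # rev1_fsh_utt1 weights.1.ark:42
  # rev2_fsh_utt1 weights.1.ark:42
  # rev3_fsh_utt1 weights.1.ark:42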
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts --max-jobs-run $max_jobs_run" \ + --chain.right-tolerance 1 --chain.left-tolerance 1 \ + --chain.alignment-subsampling-factor 1 \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 17 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/prep_test_aspire.sh --stage $test_stage --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire_ldc data/lang $dir/graph_pp $dir +fi + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/fisher_train_lms_pocolm.sh b/egs/aspire/s5/local/fisher_train_lms_pocolm.sh new file mode 100755 index 00000000000..15d2db6fb9d --- /dev/null +++ b/egs/aspire/s5/local/fisher_train_lms_pocolm.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Vimal Manohar +# Apache 2.0 +# +# It is based on the example scripts distributed with PocoLM + +set -e +stage=0 + +text=data/train_all/text +lexicon=data/local/dict/lexicon.txt +dir=data/local/pocolm + +num_ngrams_large=5000000 +num_ngrams_small=2500000 + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +for f in "$text" "$lexicon"; do + [ ! -f $x ] && echo "$0: No such file $f" && exit 1; +done + +num_dev_sentences=10000 + +#bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.854,0.0722,0.5808,0.338,0.166,0.015,0.999,0.6228,0.340,0.172,0.999,0.788,0.501,0.406" +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + cleantext=$dir/text_all.gz + + cut -d ' ' -f 2- $text | awk -v lex=$lexicon ' + BEGIN{ + while((getline0) { seen[$1]=1; } + } + { + for(n=1; n<=NF;n++) { + if (seen[$n]) { + printf("%s ", $n); + } else { + printf(" "); + } + } + printf("\n"); + }' | gzip -c > $cleantext || exit 1; + + # This is for reporting perplexities + gunzip -c $dir/text_all.gz | head -n $num_dev_sentences > \ + ${dir}/data/test.txt + + # use a subset of the annotated training data as the dev set . 
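In this copy, the word-mapping awk script above has lost its angle-bracket tokens (they appear as empty strings). A sketch of the presumably intended filter, assuming the OOV symbol is <unk> (the exact symbol should match the one used in the lexicon):

  cut -d ' ' -f 2- $text | awk -v lex=$lexicon '
    BEGIN {
      # read the lexicon words into a hash of in-vocabulary words
      while ((getline < lex) > 0) { seen[$1] = 1; }
    }
    {
      for (n = 1; n <= NF; n++) {
        if (seen[$n]) { printf("%s ", $n); } else { printf("<unk> "); }
      }
      printf("\n");
    }' | gzip -c > $cleantext

Likewise, the wordlist check further below is presumably guarding against <s> and </s> appearing as words in the lexicon.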
+ # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + gunzip -c $dir/text_all.gz | tail -n +$[num_dev_sentences+1] | \ + head -n $num_dev_sentences > ${dir}/data/text/dev.txt + + gunzip -c $dir/text_all.gz | tail -n +$[2*num_dev_sentences+1] > \ + ${dir}/data/text/train.txt + + cat $lexicon | awk '{print $1}' | sort | uniq | awk ' + { + if ($1 == "") { + print " is in the vocabulary!" | "cat 1>&2" + exit 1; + } + if ($1 == "") { + print " is in the vocabulary!" | "cat 1>&2" + exit 1; + } + printf("%s\n", $1); + }' > $dir/data/wordlist || exit 1; +fi + +order=4 +wordlist=${dir}/data/wordlist + +lm_name="`basename ${wordlist}`_${order}" +min_counts='train=1' +if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" +fi + +unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + --fold-dev-into=train ${bypass_metaparam_optim_opt} \ + --min-counts="${min_counts}" \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} | tee ${unpruned_lm_dir}/train_lm.log + + get_data_prob.py ${dir}/data/test.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' | tee ${unpruned_lm_dir}/perplexity_test.log +fi + +if [ $stage -le 2 ]; then + rm ${dir}/data/arpa/${order}gram_big.arpa.gz 2>/dev/null || true + echo "$0: pruning the LM (to larger size)" + # Using 5 million n-grams for a big LM for rescoring purposes. + prune_lm_dir.py --target-num-ngrams=$num_ngrams_large --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big \ + 2> >(tee -a ${dir}/data/lm_${order}_prune_big/prune_lm.log >&2) || true + + if [ ! -f ${dir}/data/lm_${order}_prune_big/metaparameters ]; then + grep -q "can not do any pruning" ${dir}/data/lm_${order}_prune_big/prune_lm.log + if [ $? -eq 0 ]; then + echo "$0: LM could not be pruned. Something went wrong!" + exit 1 + fi + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz + echo "$0: No pruning necessary as num-ngrams is less than target" + exit 0 + fi + + get_data_prob.py ${dir}/data/test.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_big/perplexity_test.log + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + rm ${dir}/data/arpa/${order}gram_small.arpa.gz 2>/dev/null || true + echo "$0: pruning the LM (to smaller size)" + # Using 3 million n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + prune_lm_dir.py --target-num-ngrams=$num_ngrams_small ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small \ + 2> >(tee -a ${dir}/data/lm_${order}_prune_small/prune_lm.log >&2) || true + + if [ ! -f ${dir}/data/lm_${order}_prune_small/metaparameters ]; then + grep -q "can not do any pruning" ${dir}/data/lm_${order}_prune_small/prune_lm.log + if [ $? -eq 0 ]; then + echo "$0: LM could not be pruned. Something went wrong!" 
+ exit 1 + fi + + ln -s ${order}gram_big.arpa.gz $dir/data/arpa/${order}gram_small.arpa.gz + exit 0 + fi + + + get_data_prob.py ${dir}/data/test.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_small/perplexity_test.log + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi From 524ae099435dbd99127ad2835b7226a35d2220d3 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 1 Aug 2018 12:13:46 -0400 Subject: [PATCH 167/174] Some aspire recipes --- .../s5/local/chain/tuning/run_tdnn_lstm_1b.sh | 289 ++++++++++++++++ .../chain/tuning/run_tdnn_lstm_norvb_1a.sh | 247 ++++++++++++++ .../tuning/run_tdnn_lstm_semisup_ts_1a.sh | 311 ++++++++++++++++++ 3 files changed, 847 insertions(+) create mode 100755 egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1b.sh create mode 100755 egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_norvb_1a.sh create mode 100755 egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_semisup_ts_1a.sh diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1b.sh new file mode 100755 index 00000000000..0192e0ce1fe --- /dev/null +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -0,0 +1,289 @@ +#!/bin/bash + +set -e + +# based on run_tdnn_7b.sh in the swbd recipe + +# configs for 'chain' +affix=v8 + +stage=0 +train_stage=-10 +get_egs_stage=-10 + +tdnn_affix=1a +tree_affix=bi_a +chain_affix= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $lat_dir/uttlist.$n.$nj + done + + norvb_lat_dir=exp/chain${chain_affix}/tri5a_train_lats + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd JOB=1:$norvb_nj $lat_dir/JOB/copy_lattices.JOB.log \ + lattice-copy "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk '{print "rev"n"_"$1" "$2}' + done > $lat_dir/lat_rvb.scp + + $train_cmd JOB=1:$nj $lat_dir/JOB/copy_rvb_lattices.JOB.log \ + "scp:utils/filter_scp.pl data/${train_set}/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:gzip -c > $lat_dir/lat.JOB.gz |" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree; do + cp $norvb_lat_dir/$f $lat_dir/$f + done +fi + +if [ $stage -le 10 ]; then + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + rm -rf $lang + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. 
+ steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + # we build the tree using clean features (data/train) rather than + # the augmented features (data/train_rvb) to get better alignments + + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/train $lang exp/tri5a $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 14 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/prep_test_aspire.sh --stage 1 --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + dev_aspire data/lang $dir/graph_pp $dir +fi + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_norvb_1a.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_norvb_1a.sh new file mode 100755 index 00000000000..94ba563ae0f --- /dev/null +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_norvb_1a.sh @@ -0,0 +1,247 @@ +#!/bin/bash +set -e + +# This is fisher chain recipe. + +# configs for 'chain' +affix=1a + +stage=0 +train_stage=-10 +get_egs_stage=-10 +nj=200 + +tdnn_affix=1a +tree_affix=bi_a +nnet3_affix=_norvb +chain_affix=_norvb + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer 
name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + + for d in dev test; do + ( + if [ ! -f exp/nnet3${nnet3_affix}/ivectors_${d}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${d}_hires exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_${d} || { echo "Failed i-vector extraction for data/${d}_hires"; touch $dir/.error; } + fi + + decode_dir=$dir/decode_${d}_pp + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk 160 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${d} \ + $graph_dir data/${d}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi +fi +exit 0 + +#if [ $stage -le 16 ]; then +# local/nnet3/prep_test_aspire.sh --stage 1 --decode-num-jobs 30 --affix "$affix" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp $dir +#fi +#exit 0; diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_semisup_ts_1a.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_semisup_ts_1a.sh new file mode 100755 index 00000000000..40cc9b76ced --- /dev/null +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_semisup_ts_1a.sh @@ -0,0 +1,311 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix=v8 + +stage=7 # skip ivector extractor training as it is already done for baseline system +train_stage=-10 +get_egs_stage=-10 +nj=70 + +# seed model params +src_dir=exp/chain_norvb/tdnn_lstm_1a +treedir=exp/chain_norvb/tree_bi_a +src_ivector_dir=exp/nnet3_norvb/ivectors_train_sp + +tdnn_affix=1a +chain_affix=_semisup_ts + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 +decode_iter= + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= + +num_data_reps=3 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $lat_dir/uttlist.$n.$nj + done + + rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null + + norvb_nj=$(cat $norvb_lat_dir/num_jobs) + $train_cmd JOB=1:$norvb_nj $lat_dir/JOB/copy_lattices.JOB.log \ + lattice-copy "ark:gunzip -c $norvb_lat_dir/lat.JOB.gz |" \ + ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 + + for n in `seq 3`; do + cat $lat_dir/lat_tmp.*.scp | awk '{print "rev"n"_"$1" "$2}' + done > $lat_dir/lat_rvb.scp + + $train_cmd JOB=1:$nj $lat_dir/JOB/copy_rvb_lattices.JOB.log \ + "scp:utils/filter_scp.pl data/${train_set}/split$nj/JOB/utt2spk $lat_dir/lat_rvb.scp |" \ + "ark:gzip -c > $lat_dir/lat.JOB.gz |" || exit 1 + + rm $lat_dir/lat_tmp.* $lat_dir/lat_rvb.scp + + echo $nj > $lat_dir/num_jobs + + for f in cmvn_opts final.mdl splice_opts tree frame_subsampling_factor; do + if [ -f $norvb_lat_dir/$f ]; then cp $norvb_lat_dir/$f $lat_dir/$f; fi + done + + get_egs_script=steps/nnet3/chain/get_egs_split.sh + egs_opts="--lattice-lm-scale 0.5 --lattice-beam 4.0 --tolerance 1" +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage --egs.get-egs-script=$get_egs_script \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true $egs_opts" \ + --egs.chunk-width 160,140,110,80 \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_pp +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir +fi + +if [ $stage -le 14 ]; then +#%WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys + local/nnet3/prep_test_aspire.sh --stage 1 --decode-num-jobs 30 --affix "$affix" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --window 10 --overlap 5 \ + --sub-speaker-frames 6000 --max-count 75 --ivector-scale 0.75 \ + --pass2-decode-opts "--min-active 1000" \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + dev_aspire data/lang $dir/graph_pp $dir +fi + +#if [ $stage -le 15 ]; then +# #Online decoding example +# %WER 31.5 | 2120 27224 | 74.0 13.0 13.0 5.5 31.5 77.1 | -0.558 | exp/chain/tdnn_7b_online/decode_dev_aspire_whole_uniformsegmented_win10_over5_v9_online_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys + +# local/nnet3/prep_test_aspire_online.sh --stage 2 --decode-num-jobs 30 --affix "v7" \ +# --acwt 1.0 --post-decode-acwt 10.0 \ +# --window 10 --overlap 5 \ +# --max-count 75 \ +# --pass2-decode-opts "--min-active 1000" \ +# dev_aspire data/lang $dir/graph_pp exp/chain/tdnn_7b +#fi + + + + +exit 0; + +# %WER 32.7 | 2120 27222 | 73.6 15.3 11.2 6.3 32.7 78.5 | -0.530 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter100_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 30.4 | 2120 27211 | 74.8 12.7 12.5 5.1 30.4 77.0 | -0.458 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 29.1 | 2120 27216 | 76.6 13.8 9.6 5.7 29.1 76.8 | -0.527 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter300_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.8 | 2120 27211 | 77.0 13.8 9.2 5.8 28.8 76.3 | -0.587 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.7 | 2120 27218 | 77.1 13.8 9.1 5.8 28.7 77.0 | -0.566 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.5 | 2120 27210 | 77.5 13.9 8.7 6.0 28.5 76.1 | -0.596 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter600_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27217 | 77.0 12.4 10.6 5.2 28.2 75.8 | -0.540 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter700_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 28.4 | 2120 27218 | 77.6 13.6 8.8 6.0 28.4 76.3 | -0.607 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter800_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 28.2 | 2120 27208 | 77.4 12.6 10.0 5.6 28.2 76.6 | -0.555 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter900_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27214 | 78.0 13.5 8.5 5.9 27.8 75.9 | -0.631 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1000_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.9 | 2120 27216 | 77.6 13.0 9.4 5.5 27.9 76.1 | -0.544 | 
exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1200_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys +# %WER 27.8 | 2120 27216 | 77.4 13.1 9.5 5.3 27.8 75.7 | -0.615 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1300_pp_fg/score_9/penalty_0.25/ctm.filt.filt.sys +# %WER 27.7 | 2120 27220 | 78.1 13.6 8.3 5.8 27.7 75.1 | -0.569 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1400_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys +# %WER 27.7 | 2120 27217 | 78.1 13.6 8.3 5.9 27.7 75.1 | -0.605 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iter1500_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys From 865ae0fe36c1f7049f25e98a7c53d70dbc6ec536 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 20 Aug 2018 17:01:52 -0400 Subject: [PATCH 168/174] Change the way chain egs normalization is done and sMBR supervision --- egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh | 21 +- src/chain/chain-supervision-splitter.cc | 211 +++++------------- src/chain/chain-supervision-splitter.h | 37 +-- src/chain/chain-training.cc | 15 ++ src/chain/chain-training.h | 12 +- src/chainbin/Makefile | 3 +- .../nnet3-chain-compute-numerator-post.cc | 161 +++++++++++++ src/chainbin/nnet3-chain-split-and-get-egs.cc | 27 ++- 8 files changed, 289 insertions(+), 198 deletions(-) create mode 100644 src/chainbin/nnet3-chain-compute-numerator-post.cc diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh index 4a1f9d7c982..a2054529797 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs_split.sh @@ -86,6 +86,7 @@ acwt=0.1 # For pruning phone_insertion_penalty= deriv_weights_scp= generate_egs_scp=false +use_den_fst=false echo "$0 $@" # Print the command line for logging @@ -106,7 +107,7 @@ if [ $# != 4 ]; then echo " --max-jobs-run # The maximum number of jobs you want to run in" echo " # parallel (increase this only if you have good disk and" echo " # network speed). default=6" - echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." +l echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." echo " --frames-per-iter <#samples;400000> # Number of frames of data to process per iteration, per" echo " # process." echo " --frame-subsampling-factor # factor by which num-frames at nnet output is reduced " @@ -303,7 +304,7 @@ if [ ! -z "$lattice_lm_scale" ]; then chain_supervision_all_opts="$chain_supervision_all_opts --supervision.lm-scale=$lattice_lm_scale" normalization_fst_scale=$(perl -e " - if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) { + if ($lattice_lm_scale > 1.0 || $lattice_lm_scale < 0) { print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; exit(1); } @@ -351,6 +352,10 @@ if [ -z "$graph_posterior_rspecifier" ]; then fi fi +if $use_den_fst; then + chain_supervision_all_opts="--den-fst=`dirname $dir`/den.fst" +fi + if [ $stage -le 2 ]; then echo "$0: Getting validation and training subset examples in background." rm $dir/.error 2>/dev/null @@ -466,6 +471,11 @@ if [ $stage -le 5 ]; then egs_list="$egs_list $dir/cegs_orig.$n.JOB.ark" done + normalize_egs=true + if $use_den_fst || $add_numerator_post; then + normalize_egs=false + fi + if [ $archives_multiple == 1 ]; then # normal case. 
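+    # When $normalize_egs is true, the egs are composed with the chain-dir
+    # normalization FST (scaled by normalization_fst_scale, normally
+    # 1 - lattice_lm_scale) before shuffling.  With --use-den-fst or
+    # --add-numerator-post that weighting has already been (or will later be)
+    # applied elsewhere, so normalization is skipped here to avoid applying
+    # the phone-LM weights twice.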
if $generate_egs_scp; then output_archive="ark,scp:$dir/cegs.JOB.ark,$dir/cegs.JOB.scp" @@ -473,9 +483,10 @@ if [ $stage -le 5 ]; then output_archive="ark:$dir/cegs.JOB.ark" fi - if ! $add_numerator_post; then + if $normalize_egs; then $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale \ + $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- $output_archive || exit 1; else $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ @@ -509,7 +520,7 @@ if [ $stage -le 5 ]; then ln -sf cegs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1 done done - if ! $add_numerator_post; then + if $normalize_egs; then $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:- \| \ diff --git a/src/chain/chain-supervision-splitter.cc b/src/chain/chain-supervision-splitter.cc index 98c40eb8851..0f9cf3fb357 100644 --- a/src/chain/chain-supervision-splitter.cc +++ b/src/chain/chain-supervision-splitter.cc @@ -58,45 +58,22 @@ void FstToLattice(const fst::StdVectorFst &fst, Lattice *lat) { } } -/** This function converts lattice to FSA with weight equal to - sum of acoustic and language score, and pdf_id + 1 as labels. - This assumes that the acoustic and language scores are scaled appropriately. +/** This function converts lattice to one with pdf_id + 1 as olabels. + This assumes that the ilabels are transition_ids. */ void ConvertLatticeToPdfLabels( - const TransitionModel &tmodel, - const Lattice &ifst, - fst::StdVectorFst *ofst) { - typedef fst::ArcTpl ArcIn; - typedef fst::StdArc ArcOut; - typedef ArcIn::StateId StateId; - ofst->DeleteStates(); - // The states will be numbered exactly the same as the original FST. - // Add the states to the new FST. 
- StateId num_states = ifst.NumStates(); - for (StateId s = 0; s < num_states; s++) - ofst->AddState(); - ofst->SetStart(ifst.Start()); + const TransitionModel &tmodel, Lattice *lat) { + typedef LatticeArc::StateId StateId; + StateId num_states = lat->NumStates(); for (StateId s = 0; s < num_states; s++) { - LatticeWeight final_iweight = ifst.Final(s); - if (final_iweight != LatticeWeight::Zero()) { - fst::TropicalWeight final_oweight; - ConvertLatticeWeight(final_iweight, &final_oweight); - ofst->SetFinal(s, final_oweight); - } - for (fst::ArcIterator iter(ifst, s); - !iter.Done(); - iter.Next()) { - const ArcIn &arc = iter.Value(); - KALDI_PARANOID_ASSERT(arc.weight != LatticeWeight::Zero()); - ArcOut oarc; - ConvertLatticeWeight(arc.weight, &oarc.weight); + for (fst::MutableArcIterator iter(lat, s); + !iter.Done(); iter.Next()) { + LatticeArc arc = iter.Value(); if (arc.ilabel == 0) - oarc.ilabel = 0; // epsilon arc + arc.olabel = 0; // epsilon arc else - oarc.ilabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; // pdf + 1 - oarc.olabel = oarc.ilabel; - oarc.nextstate = arc.nextstate; - ofst->AddArc(s, oarc); + arc.olabel = tmodel.TransitionIdToPdf(arc.ilabel) + 1; // pdf + 1 + iter.SetValue(arc); } } } @@ -105,15 +82,21 @@ bool LatticeToNumeratorPost(const Lattice &lat, const TransitionModel &trans_model, const fst::StdVectorFst &fst, Posterior *post, std::string key) { - fst::StdVectorFst sup_fst; - ConvertLatticeToPdfLabels(trans_model, lat, &sup_fst); + Lattice pdf_lat = lat; + ConvertLatticeToPdfLabels(trans_model, &pdf_lat); - if (!AddWeightToFst(fst, &sup_fst)) { - if (!key.empty()) - KALDI_WARN << "For key " << key << ", "; - KALDI_WARN << "FST was empty after composing with FST. " - << "This should be extremely rare (a few per corpus, at most)"; - return false; + fst::Project(&pdf_lat, fst::PROJECT_OUTPUT); + fst::StdVectorFst sup_fst; + ConvertLattice(pdf_lat, &sup_fst); + + if (fst.NumStates() > 0) { + if (!AddWeightToFst(fst, &sup_fst)) { + if (!key.empty()) + KALDI_WARN << "For key " << key << ", "; + KALDI_WARN << "FST was empty after composing with FST. " + << "This should be extremely rare (a few per corpus, at most)"; + return false; + } } // Convert fst to lattice to extract posterior using forward backward. @@ -134,12 +117,7 @@ SupervisionLatticeSplitter::SupervisionLatticeSplitter( const SupervisionLatticeSplitterOptions &opts, const SupervisionOptions &sup_opts, const TransitionModel &trans_model): - sup_opts_(sup_opts), opts_(opts), trans_model_(trans_model), - incomplete_phone_(trans_model.NumPhones() + 1) { - - if (opts_.add_partial_unk_label_left) { - MakeFilterFst(); - } + sup_opts_(sup_opts), opts_(opts), trans_model_(trans_model) { if (opts_.convert_to_unconstrained) { KALDI_WARN << "--convert-to-unconstrained=true; " @@ -189,8 +167,14 @@ bool SupervisionLatticeSplitter::GetFrameRangeSupervision( *out_lat = lat_out; } - // Apply lm-scale on the lattice and remove the acoustic costs - ScaleLattice(fst::LatticeScale(sup_opts_.lm_scale, 0.0), &lat_out); + if (den_fst_.NumStates() == 0) { + // Apply lm-scale on the lattice and remove the acoustic costs + ScaleLattice(fst::LatticeScale(sup_opts_.lm_scale, 0.0), &lat_out); + } else { + // Otherwise the lm_scale has already been applied. So just remove the + // acoustic costs. 
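+    // (fst::LatticeScale(lm_scale, acoustic_scale) builds the scale matrix
+    // used by ScaleLattice(); (1.0, 0.0) leaves the graph/LM costs unchanged
+    // and zeroes the acoustic costs.)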
+ ScaleLattice(fst::LatticeScale(1.0, 0.0), &lat_out); + } supervision->frames_per_sequence = num_frames; return GetSupervision(lat_out, supervision); @@ -244,6 +228,22 @@ void SupervisionLatticeSplitter::PrepareLattice() { fst::ScaleLattice(fst::AcousticLatticeScale( opts_.acoustic_scale), &lat_); + if (den_fst_.NumStates() > 0) { + ScaleLattice(fst::GraphLatticeScale(sup_opts_.lm_scale), &lat_); + Lattice lat_out = lat_; + ConvertLatticeToPdfLabels(trans_model_, &lat_out); + // Now ilabel is transition-id, olabel is pdf-id+1. + // So we can compose the denominator fst on the right. + + // Note: den_fst_ is already scaled by 1.0 - lm_scale + Lattice den_lat; + FstToLattice(den_fst_, &den_lat); + fst::ArcSort(&den_lat, fst::ILabelCompare()); + + fst::Compose(lat_out, den_lat, &lat_); + // In lat_, ilabel is transition-id, olabel is pdf-id+1 + } + KALDI_ASSERT(fst::TopSort(&lat_)); LatticeStateTimes(lat_, &(lat_scores_.state_times)); int32 num_states = lat_.NumStates(); @@ -292,7 +292,7 @@ void SupervisionLatticeSplitter::CreateRangeLattice( // that frame index. KALDI_ASSERT(end_iter[-1] < end_frame && (end_iter < state_times.end() || *end_iter == end_frame)); - + StateId begin_state = begin_iter - state_times.begin(), end_state = end_iter - state_times.begin(); @@ -312,10 +312,7 @@ void SupervisionLatticeSplitter::CreateRangeLattice( // Add the special final-state. StateId final_state = out_lat->AddState(); out_lat->SetFinal(final_state, LatticeWeight::One()); - - StateId prefinal_state = final_state + 1; - bool need_prefinal_state = false; - + for (StateId state = begin_state; state < end_state; state++) { StateId output_state = state - begin_state + 1; if (state_times[state] == begin_frame) { @@ -359,65 +356,17 @@ void SupervisionLatticeSplitter::CreateRangeLattice( // Note: We don't normalize here because that is already done with the // initial cost. - if (!opts_.add_partial_unk_label_left) { - out_lat->AddArc(output_state, - LatticeArc(arc.ilabel, arc.olabel, weight, final_state)); - } else { - fst::ArcIterator next_aiter(lat_, nextstate); - if (!next_aiter.Done() && next_aiter.Value().olabel == 0) { - // This is a split in the middle of a phone. - // So add an arc to the "prefinal state" from which there - // is an arc to the "final state" with special - // "incomplete phone" symbol on the output-label. - - if (!need_prefinal_state) { - prefinal_state = out_lat->AddState(); - need_prefinal_state = true; - } - - out_lat->AddArc(output_state, - LatticeArc(arc.ilabel, arc.olabel, weight, prefinal_state)); - } else { - out_lat->AddArc(output_state, - LatticeArc(arc.ilabel, arc.olabel, weight, final_state)); - } - } + out_lat->AddArc(output_state, + LatticeArc(arc.ilabel, arc.olabel, weight, final_state)); } else { StateId output_nextstate = nextstate - begin_state + 1; - Label olabel = arc.olabel; - - if (state_times[state] == begin_frame && - (opts_.add_partial_phone_label_right || - opts_.add_partial_unk_label_right)) { - int32 tid = arc.ilabel; - int32 phone = trans_model_.TransitionIdToPhone(tid); - - if (opts_.add_partial_unk_label_right) { - KALDI_ASSERT(opts_.unk_phone > 0); - phone = opts_.unk_phone; - } - - if (olabel == 0) { - // This is a split in the middle of a phone. - // So add a phone label as the output label. 
- olabel = phone; - } - } out_lat->AddArc(output_state, - LatticeArc(arc.ilabel, olabel, arc.weight, output_nextstate)); + LatticeArc(arc.ilabel, arc.olabel, arc.weight, output_nextstate)); } } } - - if (need_prefinal_state) { - // Add an "incomplete phone" label as the output symbol in the - // last arc - out_lat->AddArc(prefinal_state, - LatticeArc(0, incomplete_phone_, LatticeWeight::One(), - final_state)); - } - + KALDI_ASSERT(out_lat->Start() == 0); if (opts_.debug) { @@ -456,7 +405,7 @@ void SupervisionLatticeSplitter::CreateRangeLattice( fst::ScaleLattice(fst::AcousticLatticeScale(0), &full_lat); ConvertLattice(full_lat, &full_fst); WriteFstKaldi(std::cerr, false, full_fst); - + fst::StdVectorFst split_fst; fst::ScaleLattice(fst::AcousticLatticeScale(0), out_lat); ConvertLattice(*out_lat, &split_fst); @@ -470,27 +419,6 @@ void SupervisionLatticeSplitter::CreateRangeLattice( } void SupervisionLatticeSplitter::PostProcessLattice(Lattice *out_lat) const { - if (opts_.add_partial_unk_label_left) { - if (opts_.debug && GetVerboseLevel() > 2) { - WriteLattice(std::cerr, false, *out_lat); - } - - fst::TableComposeOptions compose_opts; - compose_opts.table_match_type = fst::MATCH_OUTPUT; - - Lattice filter_lat; - FstToLattice(filter_fst_, &filter_lat); - - Lattice temp_lat; - TableCompose(*out_lat, filter_lat, &temp_lat); - - std::swap(temp_lat, *out_lat); - - if (opts_.debug && GetVerboseLevel() > 2) { - WriteLattice(std::cerr, false, *out_lat); - } - } - fst::RmEpsilon(out_lat); if (opts_.acoustic_scale != 1.0) { @@ -790,31 +718,6 @@ void GetToleranceEnforcerFst(const SupervisionOptions &sup_opts, creator.MakeFst(); } -void SupervisionLatticeSplitter::MakeFilterFst() { - filter_fst_.DeleteStates(); - filter_fst_.AddState(); - filter_fst_.AddState(); - filter_fst_.AddState(); - - filter_fst_.SetStart(0); - - const std::vector &phones = trans_model_.GetPhones(); - for (std::vector::const_iterator it = phones.begin(); - it != phones.end(); ++it) { - filter_fst_.AddArc(0, fst::StdArc(*it, *it, - fst::TropicalWeight::One(), 0)); - filter_fst_.AddArc(0, fst::StdArc(*it, opts_.unk_phone, - fst::TropicalWeight::One(), 1)); - } - filter_fst_.AddArc(1, fst::StdArc(incomplete_phone_, 0, - fst::TropicalWeight::One(), 2)); - - filter_fst_.SetFinal(0, fst::TropicalWeight::One()); - filter_fst_.SetFinal(2, fst::TropicalWeight::One()); - - fst::ArcSort(&filter_fst_, fst::ILabelCompare()); -} - /* bool PhoneLatticeToSupervision(const fst::StdVectorFst &tolerance_fst, const TransitionModel &trans_model, diff --git a/src/chain/chain-supervision-splitter.h b/src/chain/chain-supervision-splitter.h index 8d59f01a97f..b101e97a30a 100644 --- a/src/chain/chain-supervision-splitter.h +++ b/src/chain/chain-supervision-splitter.h @@ -35,21 +35,11 @@ typedef fst::VectorFst Lattice; struct SupervisionLatticeSplitterOptions { BaseFloat acoustic_scale; bool normalize; - bool add_partial_phone_label_left; - bool add_partial_phone_label_right; - bool add_partial_unk_label_left; - bool add_partial_unk_label_right; - int32 unk_phone; bool convert_to_unconstrained; bool debug; SupervisionLatticeSplitterOptions(): acoustic_scale(1.0), normalize(true), - add_partial_phone_label_left(false), - add_partial_phone_label_right(false), - add_partial_unk_label_left(false), - add_partial_unk_label_right(false), - unk_phone(0), convert_to_unconstrained(false), debug(false) { } void Register(OptionsItf *opts) { @@ -58,24 +48,6 @@ struct SupervisionLatticeSplitterOptions { opts->Register("normalize", &normalize, "Normalize the initial 
and final scores added to split " "lattices"); - opts->Register("add-partial-phone-label-left", - &add_partial_phone_label_left, - "Add a phone label to account for partial phone transitions " - "in the left split lattices"); - opts->Register("add-partial-phone-label-right", - &add_partial_phone_label_right, - "Add a phone label to account for partial phone transitions " - "in the right split lattices"); - opts->Register("add-partial-unk-label-left", - &add_partial_unk_label_left, - "Add an UNK phone to account for partial phone transitions " - "in the left split lattices"); - opts->Register("add-partial-unk-label-right", - &add_partial_unk_label_right, - "Add an UNK phone to account for partial phone transitions " - "in the right split lattices"); - opts->Register("unk-phone", &unk_phone, - "UNK phone is added at half transition"); opts->Register("convert-to-unconstrained", &convert_to_unconstrained, "If this is true, then self-loop transitions in the " "supervision are replaced by self-loops"); @@ -159,13 +131,6 @@ class SupervisionLatticeSplitter { fst::StdVectorFst tolerance_fst_; void MakeToleranceEnforcerFst(); - const int32 incomplete_phone_; // Equal to trans_model_.NumPhones() + 1 - - // Used to remove "incomplete phone" label - // Applicable only when opts_.add_partial_unk_label_left is true. - fst::StdVectorFst filter_fst_; - void MakeFilterFst(); - // Copy of the lattice loaded using LoadLattice(). // This is required because the lattice states // need to be ordered in breadth-first search order. @@ -174,6 +139,8 @@ class SupervisionLatticeSplitter { // LatticeInfo object for lattice. // This will be computed when PrepareLattice function is called. LatticeInfo lat_scores_; + + fst::StdVectorFst den_fst_; }; void GetToleranceEnforcerFst(const SupervisionOptions &opts, const TransitionModel &trans_model, fst::StdVectorFst *tolerance_fst); diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index b8bbeaafbd8..0a3e011120b 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -222,6 +222,17 @@ void ComputeChainDenominatorObjfAndDeriv(const ChainTrainingOptions &opts, } } +void ComputeChainNumeratorPost(const Supervision &supervision, + const CuMatrixBase &nnet_output, + CuMatrixBase *numerator_post) { + KALDI_ASSERT(supervision.weight == 1.0); + KALDI_ASSERT(numerator_post->NumRows() == nnet_output.NumRows() && + numerator_post->NumCols() == nnet_output.NumCols()); + NumeratorComputation numerator(supervision, nnet_output); + numerator.Forward(); + numerator_post->SetZero(); + numerator.Backward(1.0, numerator_post); +} void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, @@ -408,6 +419,10 @@ void ComputeChainSmbrObjfAndDeriv(const ChainTrainingOptions &opts, } } + if (opts.smbr_use_numerator_post_targets && + supervision.numerator_post_targets.NumRows() > 0) { + supervision.numerator_post_targets.CopyToMat(&numerator_post); + } if (opts.smbr_threshold > 0) { KALDI_ASSERT(opts.smbr_threshold > 1.0 / nnet_output.NumCols()); diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index f3b08abb774..83a49bdfa09 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -77,6 +77,7 @@ struct ChainTrainingOptions { bool norm_regularize; BaseFloat smbr_leaky_hmm_coefficient; + bool smbr_use_numerator_post_targets; std::string smbr_factors_str, mmi_factors_str, ml_factors_str, kl_factors_str; @@ -86,7 +87,8 @@ struct ChainTrainingOptions { mmi_factor(1.0), ml_factor(0.0), 
kl_factor(0.0), smbr_factor(0.0), smbr_threshold(0.0), self_kl(false), norm_regularize(false), - smbr_leaky_hmm_coefficient(-1) { } + smbr_leaky_hmm_coefficient(-1), + smbr_use_numerator_post_targets(false) { } void Register(OptionsItf *opts) { opts->Register("l2-regularize", &l2_regularize, "l2 regularization " @@ -143,6 +145,10 @@ struct ChainTrainingOptions { opts->Register("smbr-leaky-hmm-coefficient", &smbr_leaky_hmm_coefficient, "leaky-hmm-coefficient for LF-sMBR training. If not " "provided, will use --leaky-hmm-coefficient instead."); + opts->Register("smbr-use-numerator-post-targets", + &smbr_use_numerator_post_targets, + "Use numerator posterior targets for computing " + "SMBR per-frame accuracies."); } }; @@ -190,6 +196,10 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, CuMatrixBase *nnet_output_deriv, CuMatrix *xent_output_deriv = NULL); +void ComputeChainNumeratorPost(const Supervision &supervision, + const CuMatrixBase &nnet_output, + CuMatrixBase *numerator_post); + /** This function does both the numerator and denominator parts of the 'chain' smbr computation in one call. diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile index 6efc10eadf3..9fbac324a59 100644 --- a/src/chainbin/Makefile +++ b/src/chainbin/Makefile @@ -14,7 +14,8 @@ BINFILES = chain-est-phone-lm chain-get-supervision chain-make-den-fst \ nnet3-chain-e2e-get-egs nnet3-chain-compute-post \ nnet3-chain-split-and-get-egs chain-split-lattices \ nnet3-chain-split-convert-and-get-egs \ - chain-lattice-to-post chain-fst-to-post + chain-lattice-to-post chain-fst-to-post \ + nnet3-chain-compute-numerator-post OBJFILES = diff --git a/src/chainbin/nnet3-chain-compute-numerator-post.cc b/src/chainbin/nnet3-chain-compute-numerator-post.cc new file mode 100644 index 00000000000..3d1b0fb4a00 --- /dev/null +++ b/src/chainbin/nnet3-chain-compute-numerator-post.cc @@ -0,0 +1,161 @@ +// nnet3bin/nnet3-chain-compute-numerator-post.cc + +// Copyright 2018 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3/nnet-chain-diagnostics.h" +#include "nnet3/nnet-utils.h" + +namespace kaldi { +namespace nnet3 { + +void ProcessOutputs(const Nnet &nnet, + const NnetChainExample &eg, NnetComputer *computer, + NnetChainExample *eg_out) { + *eg_out = eg; + + // There will normally be just one output here, named 'output', + // but the code is more general than this. 
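+  // We walk eg.outputs and eg_out->outputs in lockstep: for each supervision
+  // we look up the matching nnet output, run the numerator forward-backward
+  // on it, and store the resulting per-frame posteriors in the copied
+  // example as sparse targets (numerator_post_targets).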
+ std::vector::const_iterator iter = eg.outputs.begin(), + end = eg.outputs.end(); + std::vector::iterator out_iter = eg_out->outputs.begin(), + out_end = eg_out->outputs.end(); + for (; iter != end; ++iter, ++out_iter) { + const NnetChainSupervision &sup = *iter; + int32 node_index = nnet.GetNodeIndex(sup.name); + if (node_index < 0 || + !nnet.IsOutputNode(node_index)) + KALDI_ERR << "Network has no output named " << sup.name; + + const CuMatrixBase &nnet_output = computer->GetOutput(sup.name); + + CuMatrix numerator_post( + nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); + chain::ComputeChainNumeratorPost(sup.supervision, + nnet_output, &numerator_post); + + out_iter->supervision.numerator_post_targets = + SparseMatrix(Matrix(numerator_post)); + } +} + +void ComputeNumeratorPost(const NnetComputeProbOptions &nnet_config, + const Nnet &nnet, + CachingOptimizingCompiler *compiler, + const NnetChainExample &eg, + NnetChainExample *eg_out) { + bool need_model_derivative = false, store_component_stats = false, + use_xent_regularization = false, use_xent_derivative = false; + + ComputationRequest request; + GetChainComputationRequest(nnet, eg, need_model_derivative, + store_component_stats, use_xent_regularization, + use_xent_derivative, &request); + + std::shared_ptr computation = compiler->Compile(request); + NnetComputer computer(nnet_config.compute_config, *computation, + nnet, NULL); + // give the inputs to the computer object. + computer.AcceptInputs(nnet, eg.inputs); + computer.Run(); + ProcessOutputs(nnet, eg, &computer, eg_out); +} + +} // namespace nnet3 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Computes the numerator posteriors per frame of the given data with \n" + "an nnet3+chain neural net and outputs egs that include those \n" + "numerator posteriors. The input of this is the output of\n" + "e.g. nnet3-chain-get-egs |\n" + "\n" + "Usage: nnet3-chain-compute-numerator-post [options] \n" + "e.g.: nnet3-chain-compute-numerator-post 0.mdl ark:cegs.1.ark ark:cegs_out.1.ark\n"; + + bool batchnorm_test_mode = true, dropout_test_mode = true; + + // This program doesn't support using a GPU, because these probabilities are + // used for diagnostics, and you can just compute them with a small enough + // amount of data that a CPU can do it within reasonable time. + // It wouldn't be hard to make it support GPU, though. 
+ + NnetComputeProbOptions nnet_opts; + + ParseOptions po(usage); + + po.Register("batchnorm-test-mode", &batchnorm_test_mode, + "If true, set test-mode to true on any BatchNormComponents."); + po.Register("dropout-test-mode", &dropout_test_mode, + "If true, set test-mode to true on any DropoutComponents and " + "DropoutMaskComponents."); + + nnet_opts.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + nnet_opts.compute_deriv = false; + + std::string nnet_rxfilename = po.GetArg(1), + examples_rspecifier = po.GetArg(2), + examples_wspecifier = po.GetArg(3); + + Nnet nnet; + ReadKaldiObject(nnet_rxfilename, &nnet); + + if (batchnorm_test_mode) + SetBatchnormTestMode(true, &nnet); + + if (dropout_test_mode) + SetDropoutTestMode(true, &nnet); + + SequentialNnetChainExampleReader example_reader(examples_rspecifier); + NnetChainExampleWriter example_writer(examples_wspecifier); + + CachingOptimizingCompiler compiler(nnet, nnet_opts.optimize_config, + nnet_opts.compiler_config); + + int32 num_done = 0; + for (; !example_reader.Done(); example_reader.Next()) { + NnetChainExample eg_out; + ComputeNumeratorPost(nnet_opts, nnet, &compiler, + example_reader.Value(), &eg_out); + example_writer.Write(example_reader.Key(), eg_out); + num_done++; + } + + return (num_done > 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + diff --git a/src/chainbin/nnet3-chain-split-and-get-egs.cc b/src/chainbin/nnet3-chain-split-and-get-egs.cc index 86c1b1868a0..cc70af662d1 100644 --- a/src/chainbin/nnet3-chain-split-and-get-egs.cc +++ b/src/chainbin/nnet3-chain-split-and-get-egs.cc @@ -41,6 +41,7 @@ namespace nnet3 { static bool ProcessFile(const chain::SupervisionOptions &sup_opts, const fst::StdVectorFst &normalization_fst, + const fst::StdVectorFst &den_fst, const GeneralMatrix &feats, const MatrixBase *ivector_feats, int32 ivector_period, @@ -273,6 +274,7 @@ int main(int argc, char *argv[]) { int32 srand_seed = 0; std::string online_ivector_rspecifier, deriv_weights_rspecifier, graph_posterior_rspecifier; + std::string den_fst_rxfilename; BaseFloat min_post = 1e-8; bool add_numerator_post = false; @@ -307,6 +309,9 @@ int main(int argc, char *argv[]) { po.Register("add-numerator-post", &add_numerator_post, "Add numerator post to supervision; this is alternative to " "graph-posterior-rspecifier"); + po.Register("den-fst", &den_fst_rxfilename, + "If provided, will compose this with the lattice " + "before splitting."); eg_config.Register(&po); @@ -359,7 +364,7 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(normalization_fst.NumStates() > 0); if (sup_opts.lm_scale < 0.0 || sup_opts.lm_scale > 1.0) { - KALDI_ERR << "Invalid lm-scale; must be in [0.0, 1.0)"; + KALDI_ERR << "Invalid lm-scale; must be in [0.0, 1.0]"; } if (sup_opts.lm_scale != 0.0) { @@ -367,6 +372,24 @@ int main(int argc, char *argv[]) { } } + fst::StdVectorFst den_fst; + if (!den_fst_rxfilename.empty()) { + KALDI_LOG << "Adding weights from denominator FST before splitting."; + + normalization_fst = den_fst; // clear normalization FST + + ReadFstKaldi(den_fst_rxfilename, &den_fst); + KALDI_ASSERT(den_fst.NumStates() > 0); + + if (sup_opts.lm_scale < 0.0 || sup_opts.lm_scale >= 1.0) { + KALDI_ERR << "Invalid lm-scale; must be in [0.0, 1.0]"; + } + + if (sup_opts.lm_scale != 0.0) { + fst::ApplyProbabilityScale(1.0 - sup_opts.lm_scale, &den_fst); + } + } + // Read as GeneralMatrix so we don't need to un-compress and re-compress // when selecting 
parts of matrices. SequentialGeneralMatrixReader feat_reader(feature_rspecifier); @@ -454,7 +477,7 @@ int main(int argc, char *argv[]) { sup_lat_splitter.LoadLattice(lat); - if (!ProcessFile(sup_opts, normalization_fst, feats, + if (!ProcessFile(sup_opts, normalization_fst, den_fst, feats, online_ivector_feats, online_ivector_period, trans_model, sup_lat_splitter, deriv_weights, graph_posteriors, min_post, From 029eeed1db17809fd4daeb8d6c73cf3d44d3fd03 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 22 Aug 2018 17:14:36 -0400 Subject: [PATCH 169/174] Bug fix in combination objective --- src/nnet3/nnet-chain-diagnostics.cc | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index d6ceb93db5d..112b51674c6 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -290,7 +290,6 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, // computation. note, xent_deriv has a factor of '.supervision.weight', // but so does tot_weight. BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans); - xent_totals.tot_weight += tot_weight; xent_totals.tot_like += xent_objf; } @@ -298,7 +297,6 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, } } - bool NnetChainComputeProb::PrintTotalStats() const { bool ans = false; unordered_map::const_iterator @@ -351,22 +349,17 @@ const ChainObjectiveInfo* NnetChainComputeProb::GetObjective( } double NnetChainComputeProb::GetTotalObjective(double *total_weight) const { + double tot_objectives = 0.0; + double tot_weight = 0.0; unordered_map::const_iterator - iter, end; - iter = objf_info_.begin(); - end = objf_info_.end(); - BaseFloat tot_objf = 0.0, tot_weight = 0.0; + iter = objf_info_.begin(), end = objf_info_.end(); for (; iter != end; ++iter) { - const ChainObjectiveInfo &info = iter->second; - BaseFloat like = (info.tot_like / info.tot_weight); - ObjectiveValues aux_objfs(info.tot_aux_objfs); - aux_objfs.Scale(info.tot_weight); - tot_objf += like + aux_objfs.Sum(); - tot_weight += info.tot_weight; + tot_objf += iter->second.tot_like + iter->second.aux_objfs.Sum(); + tot_weight += iter->second.tot_weight; } - if(total_weight) *total_weight = tot_weight; - return tot_objf; + if (total_weight) *total_weight = tot_weight; + return tot_objectives; } static bool HasXentOutputs(const Nnet &nnet) { From d0de66186b1132aa5337386aae06b9b1343e94dc Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 22 Aug 2018 17:16:57 -0400 Subject: [PATCH 170/174] Minor fix --- src/nnet3/nnet-chain-diagnostics.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 112b51674c6..00bca26b98c 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -354,7 +354,7 @@ double NnetChainComputeProb::GetTotalObjective(double *total_weight) const { unordered_map::const_iterator iter = objf_info_.begin(), end = objf_info_.end(); for (; iter != end; ++iter) { - tot_objf += iter->second.tot_like + iter->second.aux_objfs.Sum(); + tot_objectives += iter->second.tot_like + iter->second.aux_objfs.Sum(); tot_weight += iter->second.tot_weight; } From 59aa912360f438c0f08784fc04e8a304fd095541 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 22 Aug 2018 17:26:47 -0400 Subject: [PATCH 171/174] Minor fix --- src/nnet3/nnet-chain-diagnostics.cc | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 00bca26b98c..0d8f5d5b961 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -354,7 +354,7 @@ double NnetChainComputeProb::GetTotalObjective(double *total_weight) const { unordered_map::const_iterator iter = objf_info_.begin(), end = objf_info_.end(); for (; iter != end; ++iter) { - tot_objectives += iter->second.tot_like + iter->second.aux_objfs.Sum(); + tot_objectives += iter->second.tot_like + iter->second.tot_aux_objfs.Sum(); tot_weight += iter->second.tot_weight; } From 06cf1762f20d2e90e0332e7fc1ab502312d9dc06 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 24 Aug 2018 13:16:48 -0400 Subject: [PATCH 172/174] Updating babel recipe --- .../s5d/local/chain/run_ivector_common.sh | 93 ++-- .../local/chain/tuning/run_tdnn_lstm_bab7.sh | 18 +- .../local/chain/tuning/run_tdnn_lstm_bab9.sh | 234 ++++++++++ .../chain/tuning/run_tdnn_lstm_semisup_1a.sh | 423 ++++++++++++++++++ .../local/datasets/unsupervised_asr_seg.sh | 93 ++++ egs/babel/s5d/local/run_asr_segmentation.sh | 1 + egs/babel/s5d/run-4-anydecode.sh | 10 +- 7 files changed, 819 insertions(+), 53 deletions(-) create mode 100755 egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab9.sh create mode 100755 egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_semisup_1a.sh create mode 100644 egs/babel/s5d/local/datasets/unsupervised_asr_seg.sh diff --git a/egs/babel/s5d/local/chain/run_ivector_common.sh b/egs/babel/s5d/local/chain/run_ivector_common.sh index a1a145564d0..060c775d5d5 100755 --- a/egs/babel/s5d/local/chain/run_ivector_common.sh +++ b/egs/babel/s5d/local/chain/run_ivector_common.sh @@ -16,9 +16,10 @@ gmm=tri5_cleaned # This specifies a GMM-dir from the features # of the type you're training the system on; # it should contain alignments for 'train_set'. langdir=data/langp/tri5_ali - +generate_alignments=true num_threads_ubm=12 nnet3_affix=_cleaned +extractor= . ./cmd.sh . ./path.sh @@ -57,7 +58,7 @@ if [ $stage -le 1 ]; then utils/fix_data_dir.sh data/${train_set}_sp fi -if [ $stage -le 2 ]; then +if $generate_alignments && [ $stage -le 2 ]; then echo "$0: aligning with the perturbed low-resolution data" steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 @@ -93,53 +94,55 @@ if [ $stage -le 3 ]; then steps/compute_cmvn_stats.sh \ data/${datadir}_hires_nopitch exp/make_hires/${datadir}_nopitch $mfccdir || exit 1; utils/fix_data_dir.sh data/${datadir}_hires_nopitch - done fi -if [ $stage -le 4 ]; then - echo "$0: computing a subset of data to train the diagonal UBM." - - mkdir -p exp/nnet3${nnet3_affix}/diag_ubm - temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm - - # train a diagonal UBM using a subset of about a quarter of the data - # we don't use the _comb data for this as there is no need for compatibility with - # the alignments, and using the non-combined data is more efficient for I/O - # (no messing about with piped commands). 
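+  # A minimal sketch of the subset selection that follows (illustrative only;
+  # the actual commands, with the real variable names, are below):
+  #   num_utts_total=$(wc -l <data/${train_set}_sp_hires_nopitch/utt2spk)
+  #   num_utts=$[$num_utts_total/4]
+  #   utils/subset_data_dir.sh data/${train_set}_sp_hires_nopitch \
+  #     $num_utts ${temp_data_root}/${train_set}_sp_hires_nopitch_subset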
- num_utts_total=$(wc -l 3273 combine=-0.204->-0.179 +# xent:train/valid[31,47,final]=(-2.35,-1.89,-1.86/-2.49,-2.19,-2.17) +# logprob:train/valid[31,47,final]=(-0.199,-0.158,-0.154/-0.236,-0.221,-0.222) +# 206-zulu | %WER 52.2 | 22805 52162 | 51.6 38.2 10.2 3.8 52.2 30.7 | -0.629 | exp/chain_cleaned/tdnn_lstm_bab7_sp/decode_dev10h.pem/score_11/dev10h.pem.ctm.sys +# num-iters=66 nj=2..12 num-params=36.7M dim=43+100->3274 combine=-0.237->-0.215 +# xent:train/valid[43,65,final]=(-2.42,-1.96,-1.94/-2.53,-2.25,-2.24) +# logprob:train/valid[43,65,final]=(-0.239,-0.188,-0.186/-0.279,-0.267,-0.266) +# 104-pashto | %WER 40.2 | 21825 101803 | 63.8 25.8 10.4 3.9 40.2 29.8 | -0.438 | exp/chain_cleaned/tdnn_lstm_bab7_sp/decode_dev10h.pem/score_10/dev10h.pem.ctm.sys +# num-iters=85 nj=2..12 num-params=36.8M dim=43+100->3328 combine=-0.203->-0.189 +# xent:train/valid[55,84,final]=(-2.27,-1.81,-1.79/-2.46,-2.18,-2.17) +# logprob:train/valid[55,84,final]=(-0.213,-0.166,-0.163/-0.264,-0.249,-0.250) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +dropout_schedule='0,0@0.20,0.3@0.50,0' +train_set=train_cleaned +gmm=tri5_cleaned # the gmm for the target data +langdir=data/langp/tri5_ali +num_threads_ubm=12 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +num_epochs=4 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix="_bab9" #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. +chunk_width=150,120,90,75 +chunk_left_context=40 + +hidden_dim=512 +cell_dim=512 +projection_dim=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + $langdir $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + lstm_opts="decay-time=20 dropout-proportion=0.0" + label_delay=5 + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=fastlstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=fastlstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage + fi + [ ! 
-d $dir/egs ] && mkdir -p $dir/egs/ + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context 0 \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph +fi + +exit 0 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_semisup_1a.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_semisup_1a.sh new file mode 100755 index 00000000000..5a509a1d14a --- /dev/null +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_semisup_1a.sh @@ -0,0 +1,423 @@ +#!/bin/bash + +# Unsupervised set: train_unsup100k_500k +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervises): 3,2 +# LM for decoding unsupervised data: 4gram + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=80 + +supervised_set=train_cleaned +unsupervised_set=train_unt.asr_seg_1a + +srcdir=exp/chain_cleaned/tdnn_lstm_bab9_2_nepochs10_h512_sp +treedir=exp/chain_cleaned/tree +src_extractor=exp/nnet3_cleaned/extractor +sup_lat_dir=exp/chain_cleaned/tri5_cleaned_train_cleaned_sp_lats + +nnet3_affix=_cleaned_semisup +chain_affix=_cleaned_semisup + +frames_per_eg=150,120,90,75 + +# Unsupervised options +unsup_frames_per_eg=150 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=4.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=1 +phone_insertion_penalty= + +smbr_leaky_hmm_coefficient=0.00001 +mmi_factor_schedule="output-0=1.0,1.0 output-1=1.0,1.0" +smbr_factor_schedule="output-0=0.0,0.0 output-1=0.0,0.0" + +# Semi-supervised options +affix=_semisup_1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +chain_smbr_extra_opts="--one-silence-class" +lm_weights=3,1 +num_copies=2,1 +sup_egs_dir= +unsup_egs_dir= +unsup_egs_opts= + +remove_egs=false +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + 
+apply_deriv_weights=true +use_smart_splitting=true + +# training options +num_epochs=2 +minibatch_size=64,32 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +num_threads_ubm=12 +# decode options +extra_left_context=50 +extra_right_context=0 + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < $srcdir/best_path_${unsupervised_set}_sp/frame_subsampling_factor +fi + +cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1 + +sup_ali_dir=exp/tri5_cleaned + +diff $treedir/tree $srcdir/tree || { echo "$0: $treedir/tree and $srcdir/tree differ"; exit 1; } + +dir=exp/chain${chain_affix}/tdnn_lstm${affix}_sp + +if [ $stage -le 14 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${treedir} ${srcdir}/best_path_${unsupervised_set}_sp \ + $dir +fi + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm4 output-delay=$label_delay dim=$num_targets include-log-softmax=false max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + output name=output-0 input=output.affine@$label_delay + output name=output-1 input=output.affine@$label_delay + + output name=output-0-xent input=output-xent.log-softmax@$label_delay + output name=output-1-xent input=output-xent.log-softmax@$label_delay +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +. $dir/configs/vars + +left_context=$[model_left_context + chunk_left_context] +right_context=$[model_right_context + chunk_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +egs_right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +egs_left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +egs_right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + + if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${srcdir}/decode_${unsupervised_set} + +if [ -z "$unsup_egs_dir" ]; then + [ -z $unsup_frames_per_eg ] && [ ! -z "$frames_per_eg" ] && unsup_frames_per_eg=$frames_per_eg + unsup_egs_dir=$dir/egs_${unsupervised_set} + + if [ $stage -le 17 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
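+
+  # The unsupervised egs differ from the supervised ones in a few ways (see
+  # the options passed below): the decoded lattice itself is the supervision,
+  # with --lattice-lm-scale $lattice_lm_scale giving the lattice scores a
+  # partial LM weight; --lattice-prune-beam prunes the lattice before egs
+  # extraction; and --deriv-weights-scp supplies per-frame weights derived
+  # from the best-path pdf posteriors, so less confident frames contribute
+  # less to the gradient.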
+ + echo "$0: generating egs from the unsupervised data" + if $use_smart_splitting; then + get_egs_script=steps/nnet3/chain/get_egs_split.sh + else + get_egs_script=steps/nnet3/chain/get_egs.sh + fi + + $get_egs_script --cmd "$decode_cmd --h-rt 100:00:00" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --deriv-weights-scp $srcdir/best_path_${unsupervised_set}/weights.scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs + +if [ $stage -le 18 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights --lang2num-copies "$num_copies" \ + 2 $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 19 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights $apply_deriv_weights \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --chain.smbr-leaky-hmm-coefficient $smbr_leaky_hmm_coefficient \ + --chain.mmi-factor-schedule="$mmi_factor_schedule" \ + --chain.smbr-factor-schedule="$smbr_factor_schedule" \ + --chain.smbr-extra-opts="$chain_smbr_extra_opts" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch "$minibatch_size" \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir --lang data/lang_chain || exit 1; +fi + +graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 20 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 $test_lang $dir $graph_dir +fi + +wait; +exit 0; diff --git a/egs/babel/s5d/local/datasets/unsupervised_asr_seg.sh b/egs/babel/s5d/local/datasets/unsupervised_asr_seg.sh new file mode 100644 index 00000000000..0b84e94a15e --- /dev/null +++ b/egs/babel/s5d/local/datasets/unsupervised_asr_seg.sh @@ -0,0 +1,93 @@ +#This script is not really supposed to be run directly +#Instead, it should be sourced from the decoding script +#It makes many assumption on existence of certain environmental +#variables as well as certain directory structure. +if [ ${dataset_type} != "supervised" ] ; then + mandatory_variables="my_data_dir my_data_list my_nj" + optional_variables="" +else + mandatory_variables="my_data_dir my_data_list my_nj" + optional_variables="my_stm_file" +fi + +check_variables_are_set + +decode_opts="--extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0" +sad_nnet_dir=exp/segmentation_1a/tdnn_lstm_asr_sad_1a + +workdir=exp/make_seg/${dataset_id} +unseg_dir=$workdir +mkdir -p $unseg_dir +# 4. Create the wav.scp file: +sph2pipe=`which sph2pipe || which $KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe` +if [ $? -ne 0 ] ; then + echo "Could not find sph2pipe binary. Add it to PATH" + exit 1; +fi +sox=`which sox` +if [ $? -ne 0 ] ; then + echo "Could not find sox binary. Add it to PATH" + exit 1; +fi + +echo "Creating the $unseg_dir/wav.scp file" +audiodir=$my_data_dir/audio +for file in `cat $my_data_list | sort -u` ; do + if [ -f $audiodir/$file.sph ] ; then + echo "$file $sph2pipe -f wav -p -c 1 $audiodir/$file.sph |" + elif [ -f $audiodir/$file.wav ] ; then + echo "$file $sox $audiodir/$file.wav -r 8000 -c 1 -b 16 -t wav - downsample |" + else + echo "Audio file $audiodir/$file.(sph|wav) does not exist!" >&2 + exit 1 + fi +done | sort -u > $unseg_dir/wav.scp + +l1=`cat $unseg_dir/wav.scp | wc -l ` +l2=`cat $my_data_list | wc -l ` +if [ "$l1" -ne "$l2" ] ; then + echo "wav.scp number of files: $l1" + echo "filelist number of files: $l2" + echo "Not all files from the list $my_data_list found their way into wav.scp" + exit 1 +fi + +echo "Creating the $unseg_dir/reco2file_and_channel file" +cat $unseg_dir/wav.scp | awk '{print $1, $1, "A";}' > $unseg_dir/reco2file_and_channel +cat $unseg_dir/wav.scp | awk '{print $1, $1;}' > $unseg_dir/utt2spk +utils/utt2spk_to_spk2utt.pl $unseg_dir/utt2spk > $unseg_dir/spk2utt + +steps/segmentation/detect_speech_activity.sh \ + $decode_opts \ + --nj $my_nj --acwt 0.3 \ + --mfcc-config conf/mfcc_hires_bp.conf \ + $unseg_dir \ + $sad_nnet_dir mfcc_hires_bp \ + $sad_nnet_dir $sad_nnet_dir/${dataset_id} + +utils/copy_data_dir.sh $sad_nnet_dir/${dataset_id}_seg $dataset_dir + +num_hours=`cat ${dataset_dir}/segments | \ + awk '{secs+= $4-$3;} END{print(secs/3600);}'` + +echo "Number of hours of the newly segmented data: $num_hours" + +if [ "$dataset_kind" == "supervised" ]; then + echo --------------------------------------------------------------------- + echo "preparing ${dataset_id} stm files in ${dataset_dir} on" `date` + echo --------------------------------------------------------------------- + if [ ! 
-z $my_stm_file ] ; then + local/augment_original_stm.pl $my_stm_file ${dataset_dir} + else + local/prepare_stm.pl --fragmentmarkers \-\*\~ ${dataset_dir} + fi +else + echo --------------------------------------------------------------------- + echo "preparing ${dataset_id} stm files in ${dataset_dir} on" `date` + echo --------------------------------------------------------------------- + if [ ! -z $my_stm_file ] ; then + local/augment_original_stm.pl $my_stm_file ${dataset_dir} + fi +fi + + diff --git a/egs/babel/s5d/local/run_asr_segmentation.sh b/egs/babel/s5d/local/run_asr_segmentation.sh index f70775526b6..40da4a4b50d 100755 --- a/egs/babel/s5d/local/run_asr_segmentation.sh +++ b/egs/babel/s5d/local/run_asr_segmentation.sh @@ -148,6 +148,7 @@ if [ $stage -le 6 ]; then --extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 \ --extra-left-context-initial 0 --extra-right-context-final 0 \ --nj $test_nj --acwt 0.3 --stage $test_stage \ + --mfcc-config conf/mfcc_hires_bp.conf \ data/dev10h.pem \ exp/segmentation_1a/tdnn_lstm_asr_sad_1a \ mfcc_hires_bp \ diff --git a/egs/babel/s5d/run-4-anydecode.sh b/egs/babel/s5d/run-4-anydecode.sh index 52c997ae26a..18a6412ccf7 100755 --- a/egs/babel/s5d/run-4-anydecode.sh +++ b/egs/babel/s5d/run-4-anydecode.sh @@ -242,7 +242,9 @@ if [ ! -f $dataset_dir/.done ] ; then elif [ "$dataset_kind" == "unsupervised" ] ; then if [ "$dataset_segments" == "seg" ]; then . ./local/datasets/unsupervised_seg.sh - elif [[ $dataset_segments =~ *seg* ]]; then + elif [[ $dataset_segments =~ asr_seg* ]]; then + . ./local/datasets/unsupervised_asr_seg.sh + elif [[ $dataset_segments =~ seg* ]]; then . ./local/datasets/unsupervised_seg.sh elif [ "$dataset_segments" == "uem" ] ; then . ./local/datasets/unsupervised_uem.sh @@ -555,14 +557,16 @@ if [ -f exp/$chain_model/final.mdl ]; then my_nj_backup=$my_nj rnn_opts= if [ "$is_rnn" == "true" ]; then - rnn_opts=" --extra-left-context $extra_left_context --extra-right-context $extra_right_context --frames-per-chunk $frames_per_chunk " + rnn_opts=" --extra-left-context $extra_left_context --extra-right-context $extra_right_context --frames-per-chunk $frames_per_chunk --extra-left-context-initial 0 --extra-right-context-final 0" echo "Modifying the number of jobs as this is an RNN and decoding can be extremely slow." my_nj=`cat ${dataset_dir}_hires/spk2utt|wc -l` + if [ $my_nj -gt $my_nj_backup ]; then + my_nj=$my_nj_backup + fi fi if [ ! -f $decode/.done ]; then mkdir -p $decode echo "Modifying the number of jobs as this is an RNN and decoding can be extremely slow." 
- my_nj=`cat ${dataset_dir}_hires/spk2utt|wc -l` $decode_script --nj $my_nj --cmd "$decode_cmd" $rnn_opts \ --acwt 1.0 --post-decode-acwt 10.0 \ --beam $dnn_beam --lattice-beam $dnn_lat_beam \ From 059eb67421e1034ffb586bca1bdc7840212f49f0 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 24 Aug 2018 13:18:09 -0400 Subject: [PATCH 173/174] semisup:Compose den lat before splitting --- src/chain/chain-supervision-splitter.cc | 32 +++++++++++++++---- src/chain/chain-supervision-splitter.h | 7 ++-- src/chain/chain-supervision.cc | 8 +++-- src/chain/chain-supervision.h | 4 +++ src/chainbin/chain-split-lattices.cc | 3 +- src/chainbin/nnet3-chain-get-egs.cc | 2 +- src/chainbin/nnet3-chain-split-and-get-egs.cc | 22 +++++++++---- 7 files changed, 56 insertions(+), 22 deletions(-) diff --git a/src/chain/chain-supervision-splitter.cc b/src/chain/chain-supervision-splitter.cc index 0f9cf3fb357..fc1ab65e630 100644 --- a/src/chain/chain-supervision-splitter.cc +++ b/src/chain/chain-supervision-splitter.cc @@ -29,6 +29,9 @@ namespace chain { typedef fst::ArcTpl LatticeArc; typedef fst::VectorFst Lattice; +const int kSupervisionMaxStates = 200000; // we can later make this + // configurable if needed. + void FstToLattice(const fst::StdVectorFst &fst, Lattice *lat) { lat->DeleteStates(); @@ -116,8 +119,10 @@ bool LatticeToNumeratorPost(const Lattice &lat, SupervisionLatticeSplitter::SupervisionLatticeSplitter( const SupervisionLatticeSplitterOptions &opts, const SupervisionOptions &sup_opts, - const TransitionModel &trans_model): - sup_opts_(sup_opts), opts_(opts), trans_model_(trans_model) { + const TransitionModel &trans_model, + const fst::StdVectorFst &den_fst): + sup_opts_(sup_opts), opts_(opts), trans_model_(trans_model), + den_fst_(den_fst) { if (opts_.convert_to_unconstrained) { KALDI_WARN << "--convert-to-unconstrained=true; " @@ -127,10 +132,11 @@ SupervisionLatticeSplitter::SupervisionLatticeSplitter( } } -void SupervisionLatticeSplitter::LoadLattice(const Lattice &lat) { +bool SupervisionLatticeSplitter::LoadLattice(const Lattice &lat) { lat_ = lat; - PrepareLattice(); + if (!PrepareLattice()) + return false; int32 num_states = lat_.NumStates(); @@ -142,6 +148,7 @@ void SupervisionLatticeSplitter::LoadLattice(const Lattice &lat) { KALDI_ASSERT(num_states == lat_scores_.state_times.size()); KALDI_ASSERT(lat_scores_.state_times[start_state] == 0); + return true; } bool SupervisionLatticeSplitter::GetFrameRangeSupervision( @@ -221,7 +228,7 @@ void SupervisionLatticeSplitter::LatticeInfo::Check() const { KALDI_ASSERT(state_times.back() == num_frames); } -void SupervisionLatticeSplitter::PrepareLattice() { +bool SupervisionLatticeSplitter::PrepareLattice() { // Scale the lattice to appropriate acoustic scale. 
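Before the body of PrepareLattice() continues below, a note on the interface change this patch introduces: the splitter's constructor now takes a denominator FST and LoadLattice() returns a bool, so callers must check whether composition left anything in the lattice. A minimal usage sketch follows; trans_model, den_fst and lat are assumed to have been read already, the option variable names are illustrative, and the trailing arguments of GetFrameRangeSupervision are elided.

    // Sketch of the calling pattern after this patch; not a complete program.
    // Header assumed: chain/chain-supervision-splitter.h.
    chain::SupervisionLatticeSplitterOptions splitter_opts;
    chain::SupervisionOptions sup_opts;
    // den_fst may be empty (chain-split-lattices.cc above passes a
    // default-constructed one); the extra determinization step guarded by
    // den_fst_.NumStates() > 0 is then skipped.
    chain::SupervisionLatticeSplitter splitter(splitter_opts, sup_opts,
                                               trans_model, den_fst);
    if (!splitter.LoadLattice(lat)) {
      // Composing the lattice with the denominator FST produced an empty FST;
      // the caller is expected to skip this utterance, as
      // nnet3-chain-split-and-get-egs.cc does further down in this patch.
    } else {
      chain::Supervision supervision;
      // splitter.GetFrameRangeSupervision(begin_frame, frames_per_sequence,
      //                                   &supervision, ...);  // other args elided
    }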
KALDI_ASSERT(opts_.acoustic_scale != 0.0); if (opts_.acoustic_scale != 1.0) @@ -242,6 +249,9 @@ void SupervisionLatticeSplitter::PrepareLattice() { fst::Compose(lat_out, den_lat, &lat_); // In lat_, ilabel is transition-id, olabel is pdf-id+1 + + if (lat_.NumStates() == 0) + return false; } KALDI_ASSERT(fst::TopSort(&lat_)); @@ -263,10 +273,12 @@ void SupervisionLatticeSplitter::PrepareLattice() { fst::StateSort(&lat_, state_order); ComputeLatticeScores(); + + return true; } void SupervisionLatticeSplitter::CreateRangeLattice( - int32 begin_frame, int32 end_frame, + int32 begin_frame, int32 end_frame, Lattice *out_lat) const { typedef Lattice::StateId StateId; typedef LatticeArc::Label Label; @@ -462,6 +474,11 @@ bool SupervisionLatticeSplitter::GetSupervision( fst::RmEpsilon(&(supervision->fst)); fst::DeterminizeInLog(&(supervision->fst)); + if (den_fst_.NumStates() > 0) { + TryDeterminizeMinimize(kSupervisionMaxStates, + &(supervision->fst)); + } + if (opts_.debug) { std::cerr << "tolerance added fst"; fst::WriteFstKaldi(std::cerr, false, supervision->fst); @@ -478,7 +495,8 @@ bool SupervisionLatticeSplitter::GetSupervision( supervision->weight = 1.0; supervision->num_sequences = 1; supervision->label_dim = trans_model_.NumPdfs(); - SortBreadthFirstSearch(&(supervision->fst)); + if (!opts_.convert_to_unconstrained) + SortBreadthFirstSearch(&(supervision->fst)); return true; } diff --git a/src/chain/chain-supervision-splitter.h b/src/chain/chain-supervision-splitter.h index b101e97a30a..6d74fc1a8ef 100644 --- a/src/chain/chain-supervision-splitter.h +++ b/src/chain/chain-supervision-splitter.h @@ -60,9 +60,10 @@ class SupervisionLatticeSplitter { public: SupervisionLatticeSplitter(const SupervisionLatticeSplitterOptions &opts, const SupervisionOptions &sup_opts, - const TransitionModel &trans_model); + const TransitionModel &trans_model, + const fst::StdVectorFst &den_fst); - void LoadLattice(const Lattice &lat); + bool LoadLattice(const Lattice &lat); bool GetFrameRangeSupervision(int32 begin_frame, int32 frames_per_sequence, chain::Supervision *supervision, @@ -120,7 +121,7 @@ class SupervisionLatticeSplitter { // 1) Order states in breadth-first search order // 2) Compute states times, which must be a strictly non-decreasing vector // 3) Compute lattice alpha and beta scores - void PrepareLattice(); + bool PrepareLattice(); const SupervisionOptions &sup_opts_; diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 02f428ccc23..656d1ada433 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -641,9 +641,11 @@ void Supervision::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &frames_per_sequence); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &label_dim); - bool e2e; - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &e2e); + bool e2e = false; + if (PeekToken(is, binary) == 'E') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &e2e); + } if (!e2e) { if (PeekToken(is, binary) == 'N') { ExpectToken(is, binary, ""); diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index 2cd5c40763e..83c5f782cdb 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -101,6 +101,10 @@ struct SupervisionOptions { }; +bool TryDeterminizeMinimize(int32 supervision_max_states, + fst::StdVectorFst *supervision_fst); + + // This is the form that the supervision information for 'chain' models takes // we compile it to Supervision. 
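The TryDeterminizeMinimize() declaration added just above is called from GetSupervision() earlier in this patch, guarded by den_fst_.NumStates() > 0, to keep the supervision FST small after it has been composed with the denominator FST. A minimal sketch of what such a helper could look like, using the kSupervisionMaxStates budget defined in chain-supervision-splitter.cc; this is an illustration under stated assumptions, not the actual Kaldi implementation:

    // Sketch only.  Gives up when the FST already exceeds the state budget,
    // otherwise determinizes in the log semiring and minimizes in place.
    // Headers assumed: fstext/fstext-lib.h (DeterminizeInLog, MinimizeEncoded).
    bool TryDeterminizeMinimize(int32 supervision_max_states,
                                fst::StdVectorFst *supervision_fst) {
      if (supervision_fst->NumStates() > supervision_max_states)
        return false;                     // too large; leave the FST untouched.
      fst::DeterminizeInLog(supervision_fst);
      fst::MinimizeEncoded(supervision_fst);
      return supervision_fst->NumStates() <= supervision_max_states;
    }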
// The normal compilation sequence is: diff --git a/src/chainbin/chain-split-lattices.cc b/src/chainbin/chain-split-lattices.cc index 2e4c3232a25..32fc54345a7 100644 --- a/src/chainbin/chain-split-lattices.cc +++ b/src/chainbin/chain-split-lattices.cc @@ -148,8 +148,9 @@ int main(int argc, char *argv[]) { int32 num_err = 0; + fst::StdVectorFst den_fst; chain::SupervisionLatticeSplitter sup_lat_splitter( - sup_lat_splitter_opts, sup_opts, trans_model); + sup_lat_splitter_opts, sup_opts, trans_model, den_fst); for (; !lattice_reader.Done(); lattice_reader.Next()) { std::string key = lattice_reader.Key(); diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index f38459b6270..7a7ccb364ee 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -161,7 +161,7 @@ static bool ProcessFile(const TransitionModel *trans_mdl, << (chunk.first_frame + chunk.num_frames) << ", FST was empty after composing with normalization FST. " << "This should be extremely rare (a few per corpus, at most)"; - return false; + continue; } int32 first_frame = 0; // we shift the time-indexes of all these parts so diff --git a/src/chainbin/nnet3-chain-split-and-get-egs.cc b/src/chainbin/nnet3-chain-split-and-get-egs.cc index cc70af662d1..c5c14b45679 100644 --- a/src/chainbin/nnet3-chain-split-and-get-egs.cc +++ b/src/chainbin/nnet3-chain-split-and-get-egs.cc @@ -41,7 +41,6 @@ namespace nnet3 { static bool ProcessFile(const chain::SupervisionOptions &sup_opts, const fst::StdVectorFst &normalization_fst, - const fst::StdVectorFst &den_fst, const GeneralMatrix &feats, const MatrixBase *ivector_feats, int32 ivector_period, @@ -160,7 +159,7 @@ static bool ProcessFile(const chain::SupervisionOptions &sup_opts, << (chunk.first_frame + chunk.num_frames) << ", FST was empty after composing with normalization FST. " << "This should be extremely rare (a few per corpus, at most)"; - return false; + continue; } int32 first_frame = 0; // we shift the time-indexes of all these parts so @@ -356,7 +355,8 @@ int main(int argc, char *argv[]) { UtteranceSplitter utt_splitter(eg_config); if (add_numerator_post) - KALDI_ASSERT(!normalization_fst_rxfilename.empty()); + KALDI_ASSERT(!normalization_fst_rxfilename.empty() || + !den_fst_rxfilename.empty()); fst::StdVectorFst normalization_fst; if (!normalization_fst_rxfilename.empty()) { @@ -414,9 +414,11 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(sup_opts.frame_subsampling_factor == 1); - // We required alignments to be from the same chain model + // We require alignments to be from the same chain model + // If den_fst is not empty, it will be composed with the lattice + // before splitting. chain::SupervisionLatticeSplitter sup_lat_splitter( - sup_lat_splitter_opts, sup_opts, trans_model); + sup_lat_splitter_opts, sup_opts, trans_model, den_fst); for (; !feat_reader.Done(); feat_reader.Next()) { std::string key = feat_reader.Key(); @@ -475,9 +477,15 @@ int main(int argc, char *argv[]) { } } - sup_lat_splitter.LoadLattice(lat); + if (!sup_lat_splitter.LoadLattice(lat)) { + KALDI_WARN << "For utterance " << key + << ", FST was empty after composing with denominator FST. 
" + << "This should be extremely rare (a few per corpus, at most)"; + num_err++; + continue; + } - if (!ProcessFile(sup_opts, normalization_fst, den_fst, feats, + if (!ProcessFile(sup_opts, normalization_fst, feats, online_ivector_feats, online_ivector_period, trans_model, sup_lat_splitter, deriv_weights, graph_posteriors, min_post, From 97ffada63c698e596f5bd0c7c3aa3676616d1bf6 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 22 Apr 2019 22:26:14 -0400 Subject: [PATCH 174/174] Update basic_layers.py --- egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index c3c3f4e388c..2de57a33a07 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -652,7 +652,7 @@ def _generate_config(self): line = ('component name={0}.affine' ' type=NaturalGradientAffineComponent' ' input-dim={1} output-dim={2} {3}' - ''.format(self.name, cur_dim, output_dim, affine_options) + ''.format(self.name, cur_dim, output_dim, affine_options)) configs.append(line) line = ('component-node name={0}.affine' ' component={0}.affine input={1}'