diff --git a/.travis.yml b/.travis.yml index aa8e1f41..7ec89729 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ branches: notifications: email: false -dist: xenial +dist: bionic addons: apt: diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 7dca0519..694d1fc6 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -5,6 +5,15 @@ Changelog ========= +2.0.0a3 +------- + +- Further optimized corpus parsing algorithm to use multiprocessing and to load from saved files in temporary directories +- Revamped and fixed training using subsets of the corpora +- Fixed issue with training LDA systems +- Fixed a long-standing issue with words being marked as OOV due to improperly parsing clitics +- Updated logging to better capture when errors occur due to Kaldi binaries to better locate sources of issues + 2.0.0 ----- @@ -17,6 +26,7 @@ Currently under development with major changes, see :ref:`whats_new_2_0`. performance. This change should result in faster speaker adaptation. - Optimized corpus parsing algorithm to be O(n log n) instead of O(n^2) (`PR #194`_) + 1.1.0 ----- diff --git a/docs/source/classify_speakers.rst b/docs/source/classify_speakers.rst new file mode 100644 index 00000000..26cc0a76 --- /dev/null +++ b/docs/source/classify_speakers.rst @@ -0,0 +1,69 @@ +.. _classify_speakers: + +********************** +Speaker classification +********************** + +The Montreal Forced Aligner can use trained ivector models (see :ref:`train_ivector` for more information about training +these models) to classify or cluster utterances according to speakers. + +Steps to classify speakers: + + +1. Provided the steps in :ref:`installation` have been completed and you are in the same Conda/virtual environment that + MFA was installed in. +2. Run the following command, substituting the arguments with your own paths: + + .. 
code-block:: bash + + mfa classify_speakers corpus_directory ivector_extractor_path output_directory + +If the input uses TextGrids, the output TextGrids will have utterances sorted into tiers by each identified speaker. At +the moment, there is no way to retrain the classifier based on new data. + +If the input corpus directory does not have TextGrids associated with them, then the speaker classifier will output +speaker directories with a text file that contains all the utterances that were classified. + +Options available: + +.. option:: -h + --help + + Display help message for the command + +.. option:: -t DIRECTORY + --temp_directory DIRECTORY + + Temporary directory root to use for aligning, default is ``~/Documents/MFA`` + +.. option:: -j NUMBER + --num_jobs NUMBER + + Number of jobs to use; defaults to 3, set higher if you have more + processors available and would like to process faster + +.. option:: -s NUMBER + --num_speakers NUMBER + + Number of speakers to return. If ``--cluster`` is present, this specifies the number of clusters. Otherwise, + MFA will sort speakers according to the first pass classification and then takes the top X speakers, and reclassify + the utterances to only use those speakers. + +.. option:: --cluster + + MFA will perform clustering of utterance ivectors into the number of speakers specified by ``--num_speakers`` + +.. option:: -v + --verbose + + The aligner will print out more information if present + +.. option:: -d + --debug + + The aligner will run in debug mode + +.. 
option:: -c + --clean + + Forces removal of temporary files in ``~/Documents/MFA`` \ No newline at end of file diff --git a/docs/source/commands.rst b/docs/source/commands.rst index d580fe31..4247a9e6 100644 --- a/docs/source/commands.rst +++ b/docs/source/commands.rst @@ -17,7 +17,6 @@ Forced Alignment "train", "Train an acoustic model and export resulting alignment", :ref:`trained_alignment` "validate", "Validate a corpus to ensure there are no issues with the data format", :ref:`validating_data` "train_dictionary", "Estimate pronunciation probabilities from aligning a corpus", :ref:`training_dictionary` - "train_ivector", "Train an ivector extractor for speaker diarization", "" Transcription @@ -30,6 +29,19 @@ Transcription "transcribe", "Generate transcriptions using an acoustic model, dictionary, and language model", :ref:`transcribing` "train_lm", "Train a language model from a text corpus or from an existing language model", :ref:`training_lm` +Corpus creation +=============== + +.. 
csv-table:: + :header: "Command", "Description", "Link" + :widths: 10, 110, 40 + + "create_segments", "Use voice activity detection to create segments", :ref:`create_segments` + "train_ivector", "Train an ivector extractor for speaker classification", :ref:`train_ivector` + "classify_speakers", "Use ivector extractor to classify files or cluster them", :ref:`classify_speakers` + "annotator", "Run a GUI annotator program for editing and managing corpora", :ref:`annotator` + + Other utilities =============== @@ -39,7 +51,7 @@ Other utilities "download", "Download a model trained by MFA developers", :ref:`pretrained_models` "thirdparty", "Download and validate new third party binaries", :ref:`installation` - "annotator", "Run a GUI annotator program for editing and managing corpora", :ref:`annotator` + Grapheme-to-phoneme =================== @@ -49,4 +61,4 @@ Grapheme-to-phoneme :widths: 10, 110, 40 "g2p", "Use a G2P model to generate a pronunciation dictionary", :ref:`g2p_dictionary_generating` - "train_g2p", "Train a G2P model from a pronunciation dictionary", :ref:`g2p_model_training` \ No newline at end of file + "train_g2p", "Train a G2P model from a pronunciation dictionary", :ref:`g2p_model_training` diff --git a/docs/source/conf.py b/docs/source/conf.py index 6c0814c9..10dbdcc3 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -28,7 +28,8 @@ 'scipy', 'scipy.signal', 'scipy.io', 'librosa', 'librosa.core.spectrum', 'matplotlib', 'soundfile', - 'pyqt5', 'pyqtgraph', 'requests', 'requests.exceptions'] + 'pyqt5', 'pyqtgraph', 'requests', 'requests.exceptions', + 'sklearn', 'joblib', 'sklearn.naive_bayes'] for mod_name in MOCK_MODULES: sys.modules[mod_name] = mock.Mock() diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst index 864478f5..3f5739cd 100644 --- a/docs/source/configuration.rst +++ b/docs/source/configuration.rst @@ -5,263 +5,13 @@ Configuration ************* -Global options -============== +Contents: -These options are 
used for aligning the full dataset (and as part of training). Increasing the values of them will -allow for more relaxed restrictions on alignment. Relaxing these restrictions can be particularly helpful for certain -kinds of files that are quite different from the training dataset (i.e., single word production data from experiments, -or longer stretches of audio). +.. toctree:: + :maxdepth: 3 - -.. csv-table:: - :header: "Parameter", "Default value", "Notes" - - "beam", 10, "Initial beam width to use for alignment" - "retry_beam", 40, "Beam width to use if initial alignment fails" - "transition_scale", 1.0, "Multiplier to scale transition costs" - "acoustic_scale", 0.1, "Multiplier to scale acoustic costs" - "self_loop_scale", 0.1, "Multiplier to scale self loop costs" - "boost_silence", 1.0, "1.0 is the value that does not affect probabilities" - - -.. _feature_config: - -Feature Configuration -===================== - -This section is only relevant for training, as the trained model will contain extractors and feature specification for -what it requires. - -.. csv-table:: - :header: "Parameter", "Default value", "Notes" - - "type", "mfcc", "Currently only MFCCs are supported" - "use_energy", "False", "Use energy in place of first MFCC" - "frame_shift", 10, "In milliseconds, determines time resolution" - -.. _training_config: - -Training configuration -====================== - -Global alignment options can be overwritten for each trainer (i.e., different beam settings at different stages of training). - -.. note:: - - Subsets are created by sorting the utterances by length, taking a larger subset (10 times the specified subset amount) - and then randomly sampling the specified subset amount from this larger subset. Utterances with transcriptions that - are only one word long are ignored. 
- -Monophone Configuration ------------------------ - -For the Kaldi recipe that monophone training is based on, see -https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_mono.sh - - -.. csv-table:: - :header: "Parameter", "Default value", "Notes" - - "subset", 0, "Number of utterances to use (0 uses the full corpus)" - "num_iterations", 40, "Number of training iterations" - "max_gaussians", 40, "Total number of gaussians" - "power", 0.25, "Exponent for gaussians based on occurrence counts" - - -Realignment iterations for training are calculated based on splitting the number of iterations into quarters. The first -quarter of training will perform realignment every iteration, the second quarter will perform realignment every other iteration, -and the final two quarters will perform realignment every third iteration. - - -Triphone Configuration ----------------------- - -For the Kaldi recipe that triphone training is based on, see -https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_deltas.sh - -.. csv-table:: - :header: "Parameter", "Default value", "Notes" - - "subset", 0, "Number of utterances to use (0 uses the full corpus)" - "num_iterations", 40, "Number of training iterations" - "max_gaussians", 40, "Total number of gaussians" - "power", 0.25, "Exponent for gaussians based on occurrence counts" - "num_leaves", 1000, "Number of states in the decision tree" - "cluster_threshold", -1, "Threshold for clustering leaves in decision tree" - - -LDA Configuration ------------------ - -For the Kaldi recipe that LDA training is based on, see -https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_lda_mllt.sh - -.. 
csv-table:: - :header: "Parameter", "Default value", "Notes" - - "subset", 0, "Number of utterances to use (0 uses the full corpus)" - "num_iterations", 40, "Number of training iterations" - "max_gaussians", 40, "Total number of gaussians" - "power", 0.25, "Exponent for gaussians based on occurrence counts" - "num_leaves", 1000, "Number of states in the decision tree" - "cluster_threshold", -1, "Threshold for clustering leaves in decision tree" - "lda_dimension", 40, "Dimension of resulting LDA features" - "random_prune", 4.0, "Ratio of random pruning to speed up MLLT" - - -LDA estimation will be performed every other iteration for the first quarter of iterations, and then one final LDA estimation -will be performed halfway through the training iterations. - -Speaker-adapted training (SAT) configuration --------------------------------------------- - -For the Kaldi recipe that SAT training is based on, see -https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_sat.sh - -.. csv-table:: - :header: "Parameter", "Default value", "Notes" - - "subset", 0, "Number of utterances to use (0 uses the full corpus)" - "num_iterations", 40, "Number of training iterations" - "max_gaussians", 1000, "Total number of gaussians" - "power", 0.25, "Exponent for gaussians based on occurrence counts" - "num_leaves", 1000, "Number of states in the decision tree" - "cluster_threshold", -1, "Threshold for clustering leaves in decision tree" - "silence_weight", 0.0, "Weight on silence in fMLLR estimation" - "fmllr_update_type", "full", "Type of fMLLR estimation" - - -fMLLR estimation will be performed every other iteration for the first quarter of iterations, and then one final fMLLR estimation -will be performed halfway through the training iterations. - - -.. _default_training_config: - -Default training config file ----------------------------- - -.. 
code-block:: yaml - - beam: 10 - retry_beam: 40 - - features: - type: "mfcc" - use_energy: false - frame_shift: 10 - - training: - - monophone: - num_iterations: 40 - max_gaussians: 1000 - subset: 2000 - boost_silence: 1.25 - - - triphone: - num_iterations: 35 - num_leaves: 2000 - max_gaussians: 10000 - cluster_threshold: -1 - subset: 5000 - boost_silence: 1.25 - power: 0.25 - - - lda: - num_leaves: 2500 - max_gaussians: 15000 - subset: 10000 - num_iterations: 35 - features: - splice_left_context: 3 - splice_right_context: 3 - - - sat: - num_leaves: 2500 - max_gaussians: 15000 - fmllr_power: 0.2 - silence_weight: 0.0 - fmllr_update_type: "diag" - subset: 10000 - features: - lda: true - - - sat: - num_leaves: 4200 - max_gaussians: 40000 - fmllr_power: 0.2 - silence_weight: 0.0 - fmllr_update_type: "diag" - subset: 30000 - features: - lda: true - fmllr: true - - -.. _align_config: - -Align configuration -=================== - -.. code-block:: yaml - - beam: 10 - retry_beam: 40 - -.. _transcribe_config: - -Transcriber configuration -========================= - -.. csv-table:: - :header: "Parameter", "Default value", "Notes" - - "beam", 13, "Beam for decoding" - "max_active", 7000, "Max active for decoding" - "lattice_beam", 6, "Beam width for decoding lattices" - "acoustic_scale", 0.083333, "Multiplier to scale acoustic costs" - "silence_weight", 0.01, "Weight on silence in fMLLR estimation" - "fmllr", true, "Flag for whether to perform speaker adaptation" - "first_beam", 10.0, "Beam for decoding in initial speaker-independent pass, only used if ``fmllr`` is true" - "first_max_active", 2000, "Max active for decoding in initial speaker-independent pass, only used if ``fmllr`` is true" - "fmllr_update_type", "full", "Type of fMLLR estimation" - -Default transcriber config --------------------------- - -.. 
code-block:: yaml - - beam: 13 - max_active: 7000 - lattice_beam: 6 - acoustic_scale: 0.083333 - silence_weight: 0.01 - fmllr: true - first_beam: 10.0 # Beam used in initial, speaker-indep. pass - first_max_active: 2000 # max-active used in initial pass. - fmllr_update_type: full - -.. _train_lm_config: - -Language model configuration -============================ - -.. csv-table:: - :header: "Parameter", "Default value", "Notes" - - "order", 3, "Order of language model" - "method", kneser_ney, "Method for smoothing" - "prune", false, "Flag for whether to output pruned models as well" - "prune_thresh_small", 0.0000003, "Threshold for pruning a small model, only used if ``prune`` is true" - "prune_thresh_medium", 0.0000001, "Threshold for pruning a medium model, only used if ``prune`` is true" - -Default language model config ------------------------------ - -.. code-block:: yaml - - order: 3 - method: kneser_ney - prune: false - prune_thresh_small: 0.0000003 - prune_thresh_medium: 0.0000001 + configuration_align.rst + configuration_transcription.rst + configuration_lm.rst + configuration_segment.rst + configuration_ivector.rst \ No newline at end of file diff --git a/docs/source/configuration_align.rst b/docs/source/configuration_align.rst new file mode 100644 index 00000000..f18b4de5 --- /dev/null +++ b/docs/source/configuration_align.rst @@ -0,0 +1,246 @@ + +.. _configuration_alignment: + +*********************** +Alignment Configuration +*********************** + +Global options +============== + +These options are used for aligning the full dataset (and as part of training). Increasing the values of them will +allow for more relaxed restrictions on alignment. Relaxing these restrictions can be particularly helpful for certain +kinds of files that are quite different from the training dataset (i.e., single word production data from experiments, +or longer stretches of audio). + + +.. 
csv-table:: + :header: "Parameter", "Default value", "Notes" + + "beam", 10, "Initial beam width to use for alignment" + "retry_beam", 40, "Beam width to use if initial alignment fails" + "transition_scale", 1.0, "Multiplier to scale transition costs" + "acoustic_scale", 0.1, "Multiplier to scale acoustic costs" + "self_loop_scale", 0.1, "Multiplier to scale self loop costs" + "boost_silence", 1.0, "1.0 is the value that does not affect probabilities" + + +.. _feature_config: + +Feature Configuration +===================== + +This section is only relevant for training, as the trained model will contain extractors and feature specification for +what it requires. + +.. csv-table:: + :header: "Parameter", "Default value", "Notes" + + "type", "mfcc", "Currently only MFCCs are supported" + "use_energy", "False", "Use energy in place of first MFCC" + "frame_shift", 10, "In milliseconds, determines time resolution" + +.. _training_config: + +Training configuration +====================== + +Global alignment options can be overwritten for each trainer (i.e., different beam settings at different stages of training). + +.. note:: + + Subsets are created by sorting the utterances by length, taking a larger subset (10 times the specified subset amount) + and then randomly sampling the specified subset amount from this larger subset. Utterances with transcriptions that + are only one word long are ignored. + +Monophone Configuration +----------------------- + +For the Kaldi recipe that monophone training is based on, see +https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_mono.sh + + +.. 
csv-table:: + :header: "Parameter", "Default value", "Notes" + + "subset", 0, "Number of utterances to use (0 uses the full corpus)" + "num_iterations", 40, "Number of training iterations" + "max_gaussians", 40, "Total number of gaussians" + "power", 0.25, "Exponent for gaussians based on occurrence counts" + + +Realignment iterations for training are calculated based on splitting the number of iterations into quarters. The first +quarter of training will perform realignment every iteration, the second quarter will perform realignment every other iteration, +and the final two quarters will perform realignment every third iteration. + + +Triphone Configuration +---------------------- + +For the Kaldi recipe that triphone training is based on, see +https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_deltas.sh + +.. csv-table:: + :header: "Parameter", "Default value", "Notes" + + "subset", 0, "Number of utterances to use (0 uses the full corpus)" + "num_iterations", 40, "Number of training iterations" + "max_gaussians", 40, "Total number of gaussians" + "power", 0.25, "Exponent for gaussians based on occurrence counts" + "num_leaves", 1000, "Number of states in the decision tree" + "cluster_threshold", -1, "Threshold for clustering leaves in decision tree" + + +LDA Configuration +----------------- + +For the Kaldi recipe that LDA training is based on, see +https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_lda_mllt.sh + +.. 
csv-table:: + :header: "Parameter", "Default value", "Notes" + + "subset", 0, "Number of utterances to use (0 uses the full corpus)" + "num_iterations", 40, "Number of training iterations" + "max_gaussians", 40, "Total number of gaussians" + "power", 0.25, "Exponent for gaussians based on occurrence counts" + "num_leaves", 1000, "Number of states in the decision tree" + "cluster_threshold", -1, "Threshold for clustering leaves in decision tree" + "lda_dimension", 40, "Dimension of resulting LDA features" + "random_prune", 4.0, "Ratio of random pruning to speed up MLLT" + + +LDA estimation will be performed every other iteration for the first quarter of iterations, and then one final LDA estimation +will be performed halfway through the training iterations. + +Speaker-adapted training (SAT) configuration +-------------------------------------------- + +For the Kaldi recipe that SAT training is based on, see +https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_sat.sh + +.. csv-table:: + :header: "Parameter", "Default value", "Notes" + + "subset", 0, "Number of utterances to use (0 uses the full corpus)" + "num_iterations", 40, "Number of training iterations" + "max_gaussians", 1000, "Total number of gaussians" + "power", 0.25, "Exponent for gaussians based on occurrence counts" + "num_leaves", 1000, "Number of states in the decision tree" + "cluster_threshold", -1, "Threshold for clustering leaves in decision tree" + "silence_weight", 0.0, "Weight on silence in fMLLR estimation" + "fmllr_update_type", "full", "Type of fMLLR estimation" + + +fMLLR estimation will be performed every other iteration for the first quarter of iterations, and then one final fMLLR estimation +will be performed halfway through the training iterations. + + +.. _default_training_config: + +Default training config file +---------------------------- + +.. 
code-block:: yaml + + beam: 10 + retry_beam: 40 + + features: + type: "mfcc" + use_energy: false + frame_shift: 10 + + training: + - monophone: + num_iterations: 40 + max_gaussians: 1000 + subset: 2000 + boost_silence: 1.25 + + - triphone: + num_iterations: 35 + num_leaves: 2000 + max_gaussians: 10000 + cluster_threshold: -1 + subset: 5000 + boost_silence: 1.25 + power: 0.25 + + - lda: + num_leaves: 2500 + max_gaussians: 15000 + subset: 10000 + num_iterations: 35 + features: + splice_left_context: 3 + splice_right_context: 3 + + - sat: + num_leaves: 2500 + max_gaussians: 15000 + fmllr_power: 0.2 + silence_weight: 0.0 + fmllr_update_type: "diag" + subset: 10000 + features: + lda: true + + - sat: + num_leaves: 4200 + max_gaussians: 40000 + fmllr_power: 0.2 + silence_weight: 0.0 + fmllr_update_type: "diag" + subset: 30000 + features: + lda: true + fmllr: true + +.. _1.0_training_config: + +Training configuration for 1.0 +------------------------------ + +.. code-block:: yaml + + beam: 10 + retry_beam: 40 + + features: + type: "mfcc" + use_energy: false + frame_shift: 10 + + training: + - monophone: + num_iterations: 40 + max_gaussians: 1000 + boost_silence: 1.0 + + - triphone: + num_iterations: 35 + num_leaves: 3100 + max_gaussians: 50000 + cluster_threshold: 100 + boost_silence: 1.0 + power: 0.25 + + - sat: + num_leaves: 3100 + max_gaussians: 50000 + fmllr_power: 0.2 + silence_weight: 0.0 + cluster_threshold: 100 + fmllr_update_type: "full" + + +.. _align_config: + +Align configuration +=================== + +.. code-block:: yaml + + beam: 10 + retry_beam: 40 diff --git a/docs/source/configuration_ivector.rst b/docs/source/configuration_ivector.rst new file mode 100644 index 00000000..cddda5eb --- /dev/null +++ b/docs/source/configuration_ivector.rst @@ -0,0 +1,54 @@ + +.. 
_configuration_ivector: + +********************* +Ivector Configuration +********************* + +For the Kaldi recipe that ivector extractor training is based on, see +https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/online/nnet2/train_diag_ubm.sh and +https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/online/nnet2/train_ivector_extractor.sh + +.. csv-table:: + :header: "Parameter", "Default value", "Notes" + + "ubm_num_iterations", 4, "Number of iterations for training UBM" + "ubm_num_gselect", 30, "Number of Gaussian-selection indices to use while training" + "ubm_num_frames", 500000, "Number of frames to keep in memory for initialization" + "ubm_num_gaussians", 256, "" + "ubm_num_iterations_init", 20, "Number of iteration to use when initializing UBM" + "ubm_initial_gaussian_proportion", 0.5, "Start with half the target number of Gaussians" + "ubm_min_gaussian_weight", 0.0001, "" + "ubm_remove_low_count_gaussians", True, "" + "ivector_dimension", 128, "Dimension of extracted ivectors" + "num_iterations", 10, "Number of training iterations" + "num_gselect", 20, "Gaussian-selection using diagonal model: number of Gaussians to select" + "posterior_scale", 1.0, "Scale on posteriors to correct for inter-frame correlation" + "silence_weight", 0.0, "" + "min_post", 0.025, "Minimum posterior to use (posteriors below this are pruned out)" + "num_samples_for_weights", 3, "" + "gaussian_min_count", 100, "" + "subsample", 5, "Speeds up training (samples every Xth frame)" + "max_count", 100, "" + "apply_cmn", True, "Flag for whether to apply CMVN to input features" + + +.. _default_training_config: + +Default training config file +---------------------------- + +.. 
code-block:: yaml + + features: + type: "mfcc" + use_energy: true + frame_shift: 10 + + training: + - ivector: + num_iterations: 10 + gaussian_min_count: 2 + silence_weight: 0.0 + posterior_scale: 0.1 + max_count: 100 diff --git a/docs/source/configuration_lm.rst b/docs/source/configuration_lm.rst new file mode 100644 index 00000000..c27b8c0d --- /dev/null +++ b/docs/source/configuration_lm.rst @@ -0,0 +1,31 @@ + +.. _lm_config: + +**************************** +Language model configuration +**************************** + +.. _train_lm_config: + +Language model configuration +============================ + +.. csv-table:: + :header: "Parameter", "Default value", "Notes" + + "order", 3, "Order of language model" + "method", kneser_ney, "Method for smoothing" + "prune", false, "Flag for whether to output pruned models as well" + "prune_thresh_small", 0.0000003, "Threshold for pruning a small model, only used if ``prune`` is true" + "prune_thresh_medium", 0.0000001, "Threshold for pruning a medium model, only used if ``prune`` is true" + +Default language model config +----------------------------- + +.. code-block:: yaml + + order: 3 + method: kneser_ney + prune: false + prune_thresh_small: 0.0000003 + prune_thresh_medium: 0.0000001 diff --git a/docs/source/configuration_segment.rst b/docs/source/configuration_segment.rst new file mode 100644 index 00000000..da73c16d --- /dev/null +++ b/docs/source/configuration_segment.rst @@ -0,0 +1,27 @@ + +.. _configuration_segments: + +***************************** +Create segments configuration +***************************** + + +.. 
csv-table:: + :header: "Parameter", "Default value", "Notes" + + "energy_threshold", 5.5, "Energy threshold above which a frame will be counted as voiced" + "energy_mean_scale", 0.5, "Proportion of the mean energy of the file that should be added to the energy_threshold" + "max_segment_length", 30, "Maximum length of segments before they do not get merged" + "min_pause_duration", 0.05, "Minimum unvoiced duration to split speech segments" + +.. _default_segment_config: + +Default training config file +---------------------------- + +.. code-block:: yaml + + energy_threshold: 5.5 + energy_mean_scale: 0.5 + max_segment_length: 30 + min_pause_duration: 0.05 diff --git a/docs/source/configuration_transcription.rst b/docs/source/configuration_transcription.rst new file mode 100644 index 00000000..2a0d3b5e --- /dev/null +++ b/docs/source/configuration_transcription.rst @@ -0,0 +1,34 @@ + +.. _transcribe_config: + +************************* +Transcriber configuration +************************* + +.. csv-table:: + :header: "Parameter", "Default value", "Notes" + + "beam", 13, "Beam for decoding" + "max_active", 7000, "Max active for decoding" + "lattice_beam", 6, "Beam width for decoding lattices" + "acoustic_scale", 0.083333, "Multiplier to scale acoustic costs" + "silence_weight", 0.01, "Weight on silence in fMLLR estimation" + "fmllr", true, "Flag for whether to perform speaker adaptation" + "first_beam", 10.0, "Beam for decoding in initial speaker-independent pass, only used if ``fmllr`` is true" + "first_max_active", 2000, "Max active for decoding in initial speaker-independent pass, only used if ``fmllr`` is true" + "fmllr_update_type", "full", "Type of fMLLR estimation" + +Default transcriber config +-------------------------- + +.. code-block:: yaml + + beam: 13 + max_active: 7000 + lattice_beam: 6 + acoustic_scale: 0.083333 + silence_weight: 0.01 + fmllr: true + first_beam: 10.0 # Beam used in initial, speaker-indep. 
pass + first_max_active: 2000 # max-active used in initial pass. + fmllr_update_type: full diff --git a/docs/source/corpus_creation.rst b/docs/source/corpus_creation.rst new file mode 100644 index 00000000..189593d5 --- /dev/null +++ b/docs/source/corpus_creation.rst @@ -0,0 +1,27 @@ +.. _corpus_creation: + +************************* +Corpus creation utilities +************************* + +MFA now contains several command line utilities for helping to create corpora from scratch. The main workflow is as +follows: + +1. If the corpus is made up of long sound files that need segmenting, :ref:`create_segments` +2. If the corpus does not contain transcriptions, transcribe utterances using existing acoustic models, + language models, and dictionaries (:ref:`transcribing`) +3. Use the annotator tool to fix up any errors (:ref:`annotator`) +4. As necessary, bootstrap better transcriptions: + + 1. Retrain language model with new fixed transcriptions (:ref:`training_lm`) + 2. Train dictionary pronunciation probabilities (:ref:`training_dictionary`) + +.. toctree:: + :maxdepth: 3 + + create_segments.rst + train_ivector.rst + classify_speakers.rst + transcribing.rst + training_lm.rst + training_dictionary.rst \ No newline at end of file diff --git a/docs/source/create_segments.rst b/docs/source/create_segments.rst new file mode 100644 index 00000000..49421ae7 --- /dev/null +++ b/docs/source/create_segments.rst @@ -0,0 +1,65 @@ +.. _create_segments: + +*************** +Create segments +*************** + +The Montreal Forced Aligner can use Voice Activity Detection (VAD) capabilities from Kaldi to generate segments from +a longer sound file. + +Steps to create segments: + + +1. Provided the steps in :ref:`installation` have been completed and you are in the same Conda/virtual environment that + MFA was installed in. +2. Run the following command, substituting the arguments with your own paths: + + .. 
code-block:: bash + + mfa create_segments corpus_directory output_directory + + +.. note:: + + The default configuration for VAD uses configuration values based on quiet speech. The algorithm is based on energy, + so if your recordings are more noisy, you may need to adjust the configuration. See :ref:`configuration_segments` + for more information on changing these parameters. + + +Options available: + +.. option:: -h + --help + + Display help message for the command + +.. option:: --config_path PATH + + Path to a YAML config file that will specify the alignment configuration. See + :ref:`align_config` for more details. + +.. option:: -t DIRECTORY + --temp_directory DIRECTORY + + Temporary directory root to use for aligning, default is ``~/Documents/MFA`` + +.. option:: -j NUMBER + --num_jobs NUMBER + + Number of jobs to use; defaults to 3, set higher if you have more + processors available and would like to process faster + +.. option:: -v + --verbose + + The aligner will print out more information if present + +.. option:: -d + --debug + + The aligner will run in debug mode + +.. option:: -c + --clean + + Forces removal of temporary files in ``~/Documents/MFA`` \ No newline at end of file diff --git a/docs/source/data_prep.rst b/docs/source/data_prep.rst index 789cf8df..bd8bd70b 100644 --- a/docs/source/data_prep.rst +++ b/docs/source/data_prep.rst @@ -17,13 +17,24 @@ Prior to running the aligner, make sure the following are set up: The sound files and the orthographic annotations should be contained in one directory structured as follows:: - +-- corpus_directory + +-- textgrid_corpus_directory | --- recording1.wav | --- recording1.TextGrid | --- recording2.wav | --- recording2.TextGrid | --- ... + +-- prosodylab_corpus_directory + | +-- speaker1 + | --- recording1.wav + | --- recording1.lab + | --- recording2.wav + | --- recording2.lab + | +-- speaker2 + | --- recording3.wav + | --- recording3.lab + | --- ... + .. 
note:: diff --git a/docs/source/index.rst b/docs/source/index.rst index ac896b9e..bf6686df 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -19,11 +19,9 @@ Contents: data_validation.rst aligning.rst example.rst - configuration.rst + corpus_creation.rst g2p.rst - transcribing.rst - training_dictionary.rst - training_lm.rst + configuration.rst annotator.rst pretrained_models.rst changelog.rst diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 0d97aea1..64c402e7 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -19,7 +19,7 @@ All platforms For Linux" and use the Bash console to continue the instructions. 1. Install Anaconda/Miniconda (https://docs.conda.io/en/latest/miniconda.html) -2. Create new environment via :code:`conda create -n aligner -c conda-forge python=3.8 openfst=1.7.6 pynini=2.1.0 ngram=1.3.9 baumwelch=0.3.1` +2. Create new environment via :code:`conda create -n aligner -c conda-forge openblas python=3.8 openfst=1.7.6 pynini=2.1.0 ngram=1.3.9 baumwelch=0.3.1` 3. Run :code:`pip install montreal-forced-aligner` 4. Install third-party binaries via :code:`mfa thirdparty download` (see also :ref:`collect_binaries` to collect locally built binaries) @@ -42,7 +42,8 @@ Files created when using the Montreal Forced Aligner The aligner will save data and logs for the models it trains in a new folder, ``Documents/MFA`` (which it creates in your user's home directory). If a model for a corpus already exists in MFA, it will use any existing models if you try to align it again. -(If this is not desired, delete or move the old model folder.) You can specify your own temporary directory by using the ``-t`` +(If this is not desired, delete or move the old model folder or use the ``--clean`` flag.) +You can specify your own temporary directory by using the ``-t`` flag when calling the executable. 
Supported functionality @@ -51,12 +52,16 @@ Supported functionality Currently in the 2.0 alpha, supported functionality is somewhat fragmented across platforms. Native support for features is as follows. Note that Windows can use Windows Subsystem for Linux to use the Linux version as necessary. - .. csv-table:: :header: "Feature", "Linux support", "Windows support", "MacOS support" "Alignment", "Yes", "Yes", "Yes" "G2P", "Yes", "No", "Yes" - "Transcribe", "Yes", "No", "Need kaldi binaries built locally" + "Transcribe", "Yes", "Yes", "Yes" "Train LM", "Yes", "No", "Yes" - "Train dictionary", "Yes", "No", "Need kaldi binaries built locally" + "Train dictionary", "Yes", "Yes", "Yes" + +.. warning:: + + The prebuilt Kaldi binaries were built on Ubuntu 18.04 and MacOSX 10.15 (Catalina). If you're using an older version + of either of those, follow the instructions in :ref:`collect_binaries`. \ No newline at end of file diff --git a/docs/source/news.rst b/docs/source/news.rst index 378fd279..e31ce73f 100644 --- a/docs/source/news.rst +++ b/docs/source/news.rst @@ -67,8 +67,7 @@ See also :ref:`annotator` for more information on using the annotation GUI. Transcription ------------- -Transcribing audio files is not currently implemented, but is planned for eventual release. As part of this functionality, -MFA will support: +MFA now supports: - Transcribing a corpus of sound files using an acoustic model, dictionary, and language model, see :ref:`transcribing` for more information. diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index 4712d9c9..b4b5878f 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -123,6 +123,12 @@ .. _`French Prosodylab dictionary`: https://raw.githubusercontent.com/MontrealCorpusTools/mfa-models/master/dictionary/fr.dict .. _`German Prosodylab dictionary`: https://raw.githubusercontent.com/MontrealCorpusTools/mfa-models/master/dictionary/de.dict +.. 
_`GlobalPhone language models`: https://www.csl.uni-bremen.de/GlobalPhone/ + +.. _`LibriSpeech language models`: https://www.openslr.org/11/ + +.. _`FalaBrasil language models`: https://gitlab.com/fb-asr/fb-asr-resources/kaldi-resources/-/tree/master/lm +.. _`FalaBrasil dictionary`: https://gitlab.com/fb-nlp/nlp-resources/-/tree/master/res .. _pretrained_models: @@ -151,34 +157,34 @@ by users will be periodically added. If you would like to contribute your traine at michael.e.mcauliffe@gmail.com. .. csv-table:: - :header: "Language", "Link", "Corpus", "Phone set" - - "Arabic", "Not available yet", "GlobalPhone", "GlobalPhone" - "Bulgarian", `Bulgarian acoustic model`_, "GlobalPhone", "GlobalPhone" - "Croatian", `Croatian acoustic model`_, "GlobalPhone", "GlobalPhone" - "Czech", `Czech acoustic model`_, "GlobalPhone", "GlobalPhone" - "English", `English acoustic model`_, "LibriSpeech", "Arpabet (stressed)" - "French (FR)", `French (FR) acoustic model`_, "GlobalPhone", "GlobalPhone" - "French (FR)", `French (Prosodylab) acoustic model`_, "GlobalPhone", "Prosodylab [1]_" - "French (QC)", `French (QC) acoustic model`_, "Lab speech", "Prosodylab [1]_" - "German", `German acoustic model`_, "GlobalPhone", "GlobalPhone" - "German", `German (Prosodylab) acoustic model`_, "GlobalPhone", "Prosodylab [3]_" - "Hausa", `Hausa acoustic model`_, "GlobalPhone", "GlobalPhone" - "Japanese", "Not available yet", "GlobalPhone", "GlobalPhone" - "Korean", `Korean acoustic model`_, "GlobalPhone", "GlobalPhone" - "Mandarin", `Mandarin acoustic model`_, "GlobalPhone", "GlobalPhone" - "Polish", `Polish acoustic model`_, "GlobalPhone", "GlobalPhone" - "Portuguese", `Portuguese acoustic model`_, "GlobalPhone", "GlobalPhone" - "Russian", `Russian acoustic model`_, "GlobalPhone", "GlobalPhone" - "Spanish", `Spanish acoustic model`_, "GlobalPhone", "GlobalPhone" - "Swahili", `Swahili acoustic model`_, "GlobalPhone", "GlobalPhone" - "Swedish", `Swedish acoustic model`_, "GlobalPhone", 
"GlobalPhone" - "Tamil", "Not available yet", "GlobalPhone", "GlobalPhone" - "Thai", `Thai acoustic model`_, "GlobalPhone", "GlobalPhone" - "Turkish", `Turkish acoustic model`_, "GlobalPhone", "GlobalPhone" - "Ukrainian", `Ukrainian acoustic model`_, "GlobalPhone", "GlobalPhone" - "Vietnamese", `Vietnamese acoustic model`_, "GlobalPhone", "GlobalPhone" - "Wu", "Not available yet", "GlobalPhone", "GlobalPhone" + :header: "Language", "Link", "Corpus", "Number of speakers", "Audio (hours)", "Phone set" + + "Arabic", `Arabic acoustic model`_, "GlobalPhone", 80, 19.0, "GlobalPhone" + "Bulgarian", `Bulgarian acoustic model`_, "GlobalPhone", 79, 21.4, "GlobalPhone" + "Croatian", `Croatian acoustic model`_, "GlobalPhone", 94, 15.9, "GlobalPhone" + "Czech", `Czech acoustic model`_, "GlobalPhone", 102, 31.7, "GlobalPhone" + "English", `English acoustic model`_, "LibriSpeech", 2484, 982.3, "Arpabet (stressed)" + "French (FR)", `French (FR) acoustic model`_, "GlobalPhone", 100, 26.9, "GlobalPhone" + "French (FR)", `French (Prosodylab) acoustic model`_, "GlobalPhone", 100, 26.9, "Prosodylab [1]_" + "French (QC)", `French (QC) acoustic model`_, "Lab speech", "N/A", "N/A", "Prosodylab [1]_" + "German", `German acoustic model`_, "GlobalPhone", 77, 18, "GlobalPhone" + "German", `German (Prosodylab) acoustic model`_, "GlobalPhone", 77, 18, "Prosodylab [3]_" + "Hausa", `Hausa acoustic model`_, "GlobalPhone", 103, 8.7, "GlobalPhone" + "Japanese", "Not available yet", "GlobalPhone", 144, 34, "GlobalPhone" + "Korean", `Korean acoustic model`_, "GlobalPhone", 101, 20.8, "GlobalPhone" + "Mandarin", `Mandarin acoustic model`_, "GlobalPhone", 132, 31.2, "GlobalPhone" + "Polish", `Polish acoustic model`_, "GlobalPhone", 99, 24.6, "GlobalPhone" + "Portuguese", `Portuguese acoustic model`_, "GlobalPhone", 101, 26.3, "GlobalPhone" + "Russian", `Russian acoustic model`_, "GlobalPhone", 115, 26.5, "GlobalPhone" + "Spanish", `Spanish acoustic model`_, "GlobalPhone", 102, 22.1, "GlobalPhone" + 
"Swahili", `Swahili acoustic model`_, "GlobalPhone", 70, 11.1, "GlobalPhone" + "Swedish", `Swedish acoustic model`_, "GlobalPhone", 98, 21.7, "GlobalPhone" + "Tamil", "Not available yet", "GlobalPhone", "N/A", "N/A", "GlobalPhone" + "Thai", `Thai acoustic model`_, "GlobalPhone", 98, 28.2, "GlobalPhone" + "Turkish", `Turkish acoustic model`_, "GlobalPhone", 100, 17.1, "GlobalPhone" + "Ukrainian", `Ukrainian acoustic model`_, "GlobalPhone", 119, 14.1, "GlobalPhone" + "Vietnamese", `Vietnamese acoustic model`_, "GlobalPhone", 129, 19.7, "GlobalPhone" + "Wu", "Not available yet", "GlobalPhone", 41, 9.3, "GlobalPhone" .. _pretrained_g2p: @@ -256,12 +262,24 @@ can get a full list of the currently available dictionaries via :code:`mfa downl by users will be periodically added. If you would like to contribute your dictionaries, please contact Michael McAuliffe at michael.e.mcauliffe@gmail.com. -+-----------------+-----------------------------------------------+------------------------+------------------------+ -| Language | Link | Orthography system | Phone set | -+=================+===============================================+========================+========================+ -| English | `English pronunciation dictionary`_ | Latin | Arpabet (stressed) | -+-----------------+-----------------------------------------------+------------------------+------------------------+ -| French | `French Prosodylab dictionary`_ | Latin | Prosodylab French | -+-----------------+-----------------------------------------------+------------------------+------------------------+ -| German | `German Prosodylab dictionary`_ | Latin | Prosodylab German | -+-----------------+-----------------------------------------------+------------------------+------------------------+ +.. 
csv-table:: + :header: "Language", "Link", "Orthography system", "Phone set" + + "English", `English pronunciation dictionary`_ , "Latin", "Arpabet (stressed)" + "French", `French Prosodylab dictionary`_, "Latin", "Prosodylab French" + "German", `German Prosodylab dictionary`_, "Latin", "Prosodylab German" + "Brazilian Portuguese", `FalaBrasil dictionary`_, "Latin", "" + +.. _language_models: + +Available language models +========================= + +There are several places that contain pretrained language models that can be imported to MFA. + +.. csv-table:: + :header: "Source", "Language", "Link" + + "GlobalPhone", "Various languages", `GlobalPhone language models`_ + "LibriSpeech", "English", `LibriSpeech language models`_ + "FalaBrasil", "Brazilian Portuguese", `FalaBrasil language models`_ diff --git a/docs/source/train_ivector.rst b/docs/source/train_ivector.rst new file mode 100644 index 00000000..a980a4f2 --- /dev/null +++ b/docs/source/train_ivector.rst @@ -0,0 +1,65 @@ +.. _train_ivector: + +***************************** +Training an ivector extractor +***************************** + +The Montreal Forced Aligner can train ivector extractors using an acoustic model for generating alignments. As part +of this training process, a classifier is built in that can be used as part of :ref:`classify_speakers`. + +Steps to train ivector extractor: + +1. Provided the steps in :ref:`installation` have been completed and you are in the same Conda/virtual environment that + MFA was installed in. +2. Run the following command, substituting the arguments with your own paths: + + .. code-block:: bash + + mfa train_ivector corpus_directory dictionary_path acoustic_model_path output_model_path + + +Options available: + +.. option:: -h + --help + + Display help message for the command + +.. option:: --config_path PATH + + Path to a YAML config file that will specify the training configuration. See + :ref:`configuration_ivector` for more details. + +.. 
option:: -s NUMBER + --speaker_characters NUMBER + + Number of characters to use to identify speakers; if not specified, + the aligner assumes that the directory name is the identifier for the + speaker. Additionally, it accepts the value ``prosodylab`` to use the second field of a ``_`` delimited file name, + following the convention of labelling production data in the ProsodyLab at McGill. + +.. option:: -t DIRECTORY + --temp_directory DIRECTORY + + Temporary directory root to use for aligning, default is ``~/Documents/MFA`` + +.. option:: -j NUMBER + --num_jobs NUMBER + + Number of jobs to use; defaults to 3, set higher if you have more + processors available and would like to process faster + +.. option:: -v + --verbose + + The aligner will print out more information if present + +.. option:: -d + --debug + + The aligner will run in debug mode + +.. option:: -c + --clean + + Forces removal of temporary files in ``~/Documents/MFA`` \ No newline at end of file diff --git a/montreal_forced_aligner/aligner/base.py b/montreal_forced_aligner/aligner/base.py index af3ad0b7..14350ae9 100644 --- a/montreal_forced_aligner/aligner/base.py +++ b/montreal_forced_aligner/aligner/base.py @@ -1,9 +1,13 @@ import os +import logging from .. 
import __version__ from ..multiprocessing import compile_information from ..config import TEMP_DIR +from ..helper import log_kaldi_errors +from ..exceptions import KaldiProcessingError + class BaseAligner(object): """ @@ -22,16 +26,30 @@ class BaseAligner(object): If not specified, it will be set to ``~/Documents/MFA`` call_back : callable, optional Specifies a call back function for alignment + debug : bool + Flag for running in debug mode, defaults to false + verbose : bool + Flag for running in verbose mode, defaults to false """ def __init__(self, corpus, dictionary, align_config, temp_directory=None, - call_back=None, debug=False, verbose=False): + call_back=None, debug=False, verbose=False, logger=None): self.align_config = align_config self.corpus = corpus self.dictionary = dictionary if not temp_directory: temp_directory = TEMP_DIR self.temp_directory = temp_directory + os.makedirs(self.temp_directory, exist_ok=True) + if logger is None: + self.log_file = os.path.join(self.temp_directory, 'aligner.log') + self.logger = logging.getLogger('corpus_setup') + self.logger.setLevel(logging.INFO) + handler = logging.FileHandler(self.log_file, 'w', 'utf-8') + handler.setFormatter(logging.Formatter('%(name)s %(message)s')) + self.logger.addHandler(handler) + else: + self.logger = logger self.call_back = call_back if self.call_back is None: self.call_back = print @@ -42,7 +60,11 @@ def __init__(self, corpus, dictionary, align_config, temp_directory=None, def setup(self): self.dictionary.write() self.corpus.initialize_corpus(self.dictionary) - self.align_config.feature_config.generate_features(self.corpus) + try: + self.align_config.feature_config.generate_features(self.corpus) + except Exception as e: + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs, self.logger) @property def meta(self): @@ -60,9 +82,8 @@ def compile_information(self, model_directory, output_directory): with open(issue_path, 'w', encoding='utf8') as f: for u, r in
sorted(issues.items()): f.write('{}\t{}\n'.format(u, r)) - print('There were {} segments/files not aligned. ' - 'Please see {} for more details on why alignment failed for these files.'.format(len(issues), - issue_path)) + self.logger.warning('There were {} segments/files not aligned. Please see {} for more details on why ' + 'alignment failed for these files.'.format(len(issues), issue_path)) def export_textgrids(self, output_directory): """ diff --git a/montreal_forced_aligner/aligner/pretrained.py b/montreal_forced_aligner/aligner/pretrained.py index 441e8877..d8e3bde7 100644 --- a/montreal_forced_aligner/aligner/pretrained.py +++ b/montreal_forced_aligner/aligner/pretrained.py @@ -4,6 +4,8 @@ from .base import BaseAligner from ..multiprocessing import (align, convert_ali_to_textgrids, compile_train_graphs, generate_pronunciations) +from ..exceptions import KaldiProcessingError +from ..helper import log_kaldi_errors def parse_transitions(path, phones_path): @@ -51,10 +53,10 @@ class PretrainedAligner(BaseAligner): def __init__(self, corpus, dictionary, acoustic_model, align_config, temp_directory=None, - call_back=None, debug=False, verbose=False): + call_back=None, debug=False, verbose=False, logger=None): self.acoustic_model = acoustic_model super(PretrainedAligner, self).__init__(corpus, dictionary, align_config, temp_directory, - call_back, debug, verbose) + call_back, debug, verbose, logger) self.align_config.data_directory = corpus.split_directory() self.acoustic_model.export_model(self.align_directory) log_dir = os.path.join(self.align_directory, 'log') @@ -75,14 +77,28 @@ def setup(self): super(PretrainedAligner, self).setup() def align(self): - compile_train_graphs(self.align_directory, self.dictionary.output_directory, - self.align_config.data_directory, self.corpus.num_jobs, self.align_config) - self.acoustic_model.feature_config.generate_features(self.corpus) - log_dir = os.path.join(self.align_directory, 'log') - os.makedirs(log_dir, exist_ok=True) 
- align('final', self.align_directory, self.align_config.data_directory, - self.dictionary.optional_silence_csl, - self.corpus.num_jobs, self.align_config) + done_path = os.path.join(self.align_directory, 'done') + dirty_path = os.path.join(self.align_directory, 'dirty') + if os.path.exists(done_path): + self.logger.info('Alignment already done, skipping.') + return + try: + compile_train_graphs(self.align_directory, self.dictionary.output_directory, + self.align_config.data_directory, self.corpus.num_jobs, self.align_config) + self.acoustic_model.feature_config.generate_features(self.corpus) + log_dir = os.path.join(self.align_directory, 'log') + os.makedirs(log_dir, exist_ok=True) + align('final', self.align_directory, self.align_config.data_directory, + self.dictionary.optional_silence_csl, + self.corpus.num_jobs, self.align_config) + except Exception as e: + with open(dirty_path, 'w'): + pass + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs, self.logger) + raise + with open(done_path, 'w'): + pass def export_textgrids(self, output_directory): """ diff --git a/montreal_forced_aligner/aligner/trainable.py b/montreal_forced_aligner/aligner/trainable.py index 7300504c..35447596 100644 --- a/montreal_forced_aligner/aligner/trainable.py +++ b/montreal_forced_aligner/aligner/trainable.py @@ -1,6 +1,9 @@ from ..multiprocessing import (convert_ali_to_textgrids) from .base import BaseAligner +from ..helper import log_kaldi_errors +from ..exceptions import KaldiProcessingError + class TrainableAligner(BaseAligner): """ @@ -24,16 +27,22 @@ class TrainableAligner(BaseAligner): """ def __init__(self, corpus, dictionary, training_config, align_config, temp_directory=None, - call_back=None, debug=False, verbose=False): + call_back=None, debug=False, verbose=False, logger=None): self.training_config = training_config super(TrainableAligner, self).__init__(corpus, dictionary, align_config, temp_directory, - call_back, debug, verbose) + call_back, debug, 
verbose, logger) def setup(self): - self.dictionary.write() + if self.dictionary is not None: + self.dictionary.set_word_set(self.corpus.word_set) + self.dictionary.write() self.corpus.initialize_corpus(self.dictionary) for identifier, trainer in self.training_config.items(): - trainer.feature_config.generate_features(self.corpus) + try: + trainer.feature_config.generate_features(self.corpus) + except Exception as e: + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs, self.logger) break def save(self, path): @@ -46,7 +55,7 @@ def save(self, path): Path to save acoustic model and dictionary """ self.training_config.values()[-1].save(path) - print('Saved model to {}'.format(path)) + self.logger.info('Saved model to {}'.format(path)) @property def meta(self): @@ -62,6 +71,8 @@ def meta(self): def train(self): previous = None for identifier, trainer in self.training_config.items(): + trainer.debug = self.debug + trainer.logger = self.logger if previous is not None: previous.align(trainer.subset) trainer.init_training(identifier, self.temp_directory, self.corpus, self.dictionary, previous) diff --git a/montreal_forced_aligner/command_line/align.py b/montreal_forced_aligner/command_line/align.py index 2780a640..17f113a6 100644 --- a/montreal_forced_aligner/command_line/align.py +++ b/montreal_forced_aligner/command_line/align.py @@ -12,25 +12,12 @@ from montreal_forced_aligner.config import TEMP_DIR, align_yaml_to_config, load_basic_align from montreal_forced_aligner.utils import get_available_acoustic_languages, get_pretrained_acoustic_path, \ get_available_dict_languages, get_dictionary_path +from montreal_forced_aligner.helper import setup_logger from montreal_forced_aligner.exceptions import ArgumentError -class DummyArgs(object): - def __init__(self): - self.corpus_directory = '' - self.dictionary_path = '' - self.acoustic_model_path = '' - self.speaker_characters = 0 - self.num_jobs = 0 - self.verbose = False - self.clean = True - self.fast = 
True - self.debug = False - self.temp_directory = None - self.config_path = '' - - -def align_corpus(args): +def align_corpus(args, unknown_args=None): + command = 'align' all_begin = time.time() if not args.temp_directory: temp_dir = TEMP_DIR @@ -41,64 +28,90 @@ def align_corpus(args): args.corpus_directory = os.path.dirname(args.corpus_directory) corpus_name = os.path.basename(args.corpus_directory) data_directory = os.path.join(temp_dir, corpus_name) + logger = setup_logger(command, data_directory) + if args.config_path: + align_config = align_yaml_to_config(args.config_path) + else: + align_config = load_basic_align() + if unknown_args: + align_config.update_from_args(unknown_args) conf_path = os.path.join(data_directory, 'config.yml') + if getattr(args, 'clean', False) and os.path.exists(data_directory): + logger.info('Cleaning old directory!') + shutil.rmtree(data_directory, ignore_errors=True) if os.path.exists(conf_path): with open(conf_path, 'r') as f: conf = yaml.load(f, Loader=yaml.SafeLoader) else: conf = {'dirty': False, - 'begin': time.time(), + 'begin': all_begin, 'version': __version__, - 'type': 'align', + 'type': command, 'corpus_directory': args.corpus_directory, - 'dictionary_path': args.dictionary_path} + 'dictionary_path': args.dictionary_path, + 'acoustic_model_path': args.acoustic_model_path} if getattr(args, 'clean', False) \ - or conf['dirty'] or conf['type'] != 'align' \ + or conf['dirty'] or conf['type'] != command \ or conf['corpus_directory'] != args.corpus_directory \ or conf['version'] != __version__ \ or conf['dictionary_path'] != args.dictionary_path: - shutil.rmtree(data_directory, ignore_errors=True) + logger.warning( + 'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no ' + 'weird behavior for previous versions of the temporary directory.') + if conf['dirty']: + logger.debug('Previous run ended in an error (maybe ctrl-c?)') + if conf['type'] != command: + logger.debug('Previous 
run was a different subcommand than {} (was {})'.format(command, conf['type'])) + if conf['corpus_directory'] != args.corpus_directory: + logger.debug('Previous run used source directory ' + 'path {} (new run: {})'.format(conf['corpus_directory'], args.corpus_directory)) + if conf['version'] != __version__: + logger.debug('Previous run was on {} version (new run: {})'.format(conf['version'], __version__)) + if conf['dictionary_path'] != args.dictionary_path: + logger.debug('Previous run used dictionary path {} ' + '(new run: {})'.format(conf['dictionary_path'], args.dictionary_path)) + if conf['acoustic_model_path'] != args.acoustic_model_path: + logger.debug('Previous run used acoustic model path {} ' + '(new run: {})'.format(conf['acoustic_model_path'], args.acoustic_model_path)) os.makedirs(data_directory, exist_ok=True) os.makedirs(args.output_directory, exist_ok=True) try: corpus = AlignableCorpus(args.corpus_directory, data_directory, - speaker_characters=args.speaker_characters, - num_jobs=args.num_jobs) + speaker_characters=args.speaker_characters, + num_jobs=args.num_jobs, logger=logger, use_mp=align_config.use_mp) if corpus.issues_check: - print('WARNING: Some issues parsing the corpus were detected. ' + logger.warning('WARNING: Some issues parsing the corpus were detected. 
' 'Please run the validator to get more information.') - print(corpus.speaker_utterance_info()) + logger.info(corpus.speaker_utterance_info()) + dictionary = Dictionary(args.dictionary_path, data_directory, word_set=corpus.word_set, logger=logger) acoustic_model = AcousticModel(args.acoustic_model_path) - dictionary = Dictionary(args.dictionary_path, data_directory, word_set=corpus.word_set) acoustic_model.validate(dictionary) begin = time.time() - if args.config_path: - align_config = align_yaml_to_config(args.config_path) - else: - align_config = load_basic_align() a = PretrainedAligner(corpus, dictionary, acoustic_model, align_config, temp_directory=data_directory, - debug=getattr(args, 'debug', False)) - if args.debug: - print('Setup pretrained aligner in {} seconds'.format(time.time() - begin)) + debug=getattr(args, 'debug', False), logger=logger) + logger.debug('Setup pretrained aligner in {} seconds'.format(time.time() - begin)) a.verbose = args.verbose begin = time.time() a.align() - if args.debug: - print('Performed alignment in {} seconds'.format(time.time() - begin)) + logger.debug('Performed alignment in {} seconds'.format(time.time() - begin)) begin = time.time() a.export_textgrids(args.output_directory) - if args.debug: - print('Exported TextGrids in {} seconds'.format(time.time() - begin)) - print('Done! Everything took {} seconds'.format(time.time() - all_begin)) + logger.debug('Exported TextGrids in {} seconds'.format(time.time() - begin)) + logger.info('All done!') + logger.debug('Done! 
Everything took {} seconds'.format(time.time() - all_begin)) except Exception as _: conf['dirty'] = True raise finally: + handlers = logger.handlers[:] + for handler in handlers: + handler.close() + logger.removeHandler(handler) with open(conf_path, 'w') as f: yaml.dump(conf, f) @@ -131,7 +144,7 @@ def validate_args(args, downloaded_acoustic_models, download_dictionaries): args.acoustic_model_path.lower(), ', '.join(downloaded_acoustic_models))) -def run_align_corpus(args, downloaded_acoustic_models=None, download_dictionaries=None): +def run_align_corpus(args, unknown_args=None, downloaded_acoustic_models=None, download_dictionaries=None): if downloaded_acoustic_models is None: downloaded_acoustic_models = get_available_acoustic_languages() if download_dictionaries is None: @@ -144,14 +157,17 @@ def run_align_corpus(args, downloaded_acoustic_models=None, download_dictionarie args.corpus_directory = args.corpus_directory.rstrip('/').rstrip('\\') validate_args(args, downloaded_acoustic_models, download_dictionaries) - align_corpus(args) + align_corpus(args, unknown_args) if __name__ == '__main__': # pragma: no cover mp.freeze_support() - from montreal_forced_aligner.command_line.mfa import align_parser, fix_path, unfix_path, acoustic_languages, dict_languages + from montreal_forced_aligner.command_line.mfa import align_parser, fix_path, unfix_path, acoustic_languages, \ + dict_languages - align_args = align_parser.parse_args() + align_args, unknown = align_parser.parse_known_args() + print(align_args) + print(unknown) fix_path() - run_align_corpus(align_args, acoustic_languages, dict_languages) + run_align_corpus(align_args, unknown, acoustic_languages, dict_languages) unfix_path() diff --git a/montreal_forced_aligner/command_line/classify_speakers.py b/montreal_forced_aligner/command_line/classify_speakers.py new file mode 100644 index 00000000..7087c829 --- /dev/null +++ b/montreal_forced_aligner/command_line/classify_speakers.py @@ -0,0 +1,144 @@ +import shutil 
+import os +import time +import multiprocessing as mp +import yaml + +from montreal_forced_aligner import __version__ +from montreal_forced_aligner.corpus.transcribe_corpus import TranscribeCorpus +from montreal_forced_aligner.speaker_classifier import SpeakerClassifier +from montreal_forced_aligner.models import IvectorExtractor +from montreal_forced_aligner.config import TEMP_DIR, classification_yaml_to_config, load_basic_classification +from montreal_forced_aligner.utils import get_available_ivector_languages, get_pretrained_ivector_path +from montreal_forced_aligner.helper import setup_logger +from montreal_forced_aligner.exceptions import ArgumentError + + +def classify_speakers(args): + command = 'classify_speakers' + all_begin = time.time() + if not args.temp_directory: + temp_dir = TEMP_DIR + else: + temp_dir = os.path.expanduser(args.temp_directory) + corpus_name = os.path.basename(args.corpus_directory) + if corpus_name == '': + args.corpus_directory = os.path.dirname(args.corpus_directory) + corpus_name = os.path.basename(args.corpus_directory) + data_directory = os.path.join(temp_dir, corpus_name) + conf_path = os.path.join(data_directory, 'config.yml') + logger = setup_logger(command, data_directory) + if args.config_path: + classification_config = classification_yaml_to_config(args.config_path) + else: + classification_config = load_basic_classification() + classification_config.use_mp = not args.disable_mp + if getattr(args, 'clean', False) and os.path.exists(data_directory): + logger.info('Cleaning old directory!') + shutil.rmtree(data_directory, ignore_errors=True) + if os.path.exists(conf_path): + with open(conf_path, 'r') as f: + conf = yaml.load(f, Loader=yaml.SafeLoader) + else: + conf = {'dirty': False, + 'begin': time.time(), + 'version': __version__, + 'type': command, + 'corpus_directory': args.corpus_directory, + 'ivector_extractor_path': args.ivector_extractor_path} + if conf['dirty'] or conf['type'] != command \ + or 
conf['corpus_directory'] != args.corpus_directory \ + or conf['version'] != __version__: + logger.warning( + 'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no ' + 'weird behavior for previous versions of the temporary directory.') + if conf['dirty']: + logger.debug('Previous run ended in an error (maybe ctrl-c?)') + if conf['type'] != command: + logger.debug('Previous run was a different subcommand than {} (was {})'.format(command, conf['type'])) + if conf['corpus_directory'] != args.corpus_directory: + logger.debug('Previous run used source directory ' + 'path {} (new run: {})'.format(conf['corpus_directory'], args.corpus_directory)) + if conf['version'] != __version__: + logger.debug('Previous run was on {} version (new run: {})'.format(conf['version'], __version__)) + if conf['ivector_extractor_path'] != args.ivector_extractor_path: + logger.debug('Previous run used ivector extractor path {} ' + '(new run: {})'.format(conf['ivector_extractor_path'], args.ivector_extractor_path)) + + os.makedirs(data_directory, exist_ok=True) + os.makedirs(args.output_directory, exist_ok=True) + try: + corpus = TranscribeCorpus(args.corpus_directory, data_directory, + num_jobs=args.num_jobs, logger=logger, use_mp=classification_config.use_mp) + ivector_extractor = IvectorExtractor(args.ivector_extractor_path, root_directory=data_directory) + + begin = time.time() + a = SpeakerClassifier(corpus, ivector_extractor, classification_config, + temp_directory=data_directory, + debug=getattr(args, 'debug', False), logger=logger, num_speakers=args.num_speakers, + cluster=args.cluster) + logger.debug('Setup speaker classifier in {} seconds'.format(time.time() - begin)) + a.verbose = args.verbose + + begin = time.time() + a.classify() + logger.debug('Performed classification in {} seconds'.format(time.time() - begin)) + + begin = time.time() + a.export_classification(args.output_directory) + logger.debug('Exported classification in {} 
seconds'.format(time.time() - begin)) + logger.info('Done!') + logger.debug('Done! Everything took {} seconds'.format(time.time() - all_begin)) + except Exception as _: + conf['dirty'] = True + raise + finally: + handlers = logger.handlers[:] + for handler in handlers: + handler.close() + logger.removeHandler(handler) + with open(conf_path, 'w') as f: + yaml.dump(conf, f) + + +def validate_args(args, downloaded_ivector_extractors): + if args.cluster and not args.num_speakers: + raise ArgumentError('If using clustering, num_speakers must be specified') + if not os.path.exists(args.corpus_directory): + raise ArgumentError('Could not find the corpus directory {}.'.format(args.corpus_directory)) + if not os.path.isdir(args.corpus_directory): + raise ArgumentError('The specified corpus directory ({}) is not a directory.'.format(args.corpus_directory)) + + if args.corpus_directory == args.output_directory: + raise ArgumentError('Corpus directory and output directory cannot be the same folder.') + + if args.ivector_extractor_path.lower() in downloaded_ivector_extractors: + args.ivector_extractor_path = get_pretrained_ivector_path(args.ivector_extractor_path.lower()) + elif args.ivector_extractor_path.lower().endswith(IvectorExtractor.extension): + if not os.path.exists(args.ivector_extractor_path): + raise ArgumentError('The specified model path does not exist: ' + args.ivector_extractor_path) + else: + raise ArgumentError( + 'The language \'{}\' is not currently included in the distribution, ' + 'please align via training or specify one of the following language names: {}.'.format( + args.ivector_extractor_path.lower(), ', '.join(downloaded_ivector_extractors))) + + +def run_classify_speakers(args, downloaded_ivector_extractors=None): + if downloaded_ivector_extractors is None: + downloaded_ivector_extractors = get_available_ivector_languages() + args.output_directory = args.output_directory.rstrip('/').rstrip('\\') + args.corpus_directory = 
args.corpus_directory.rstrip('/').rstrip('\\') + + validate_args(args, downloaded_ivector_extractors) + classify_speakers(args) + + +if __name__ == '__main__': # pragma: no cover + mp.freeze_support() + from montreal_forced_aligner.command_line.mfa import classify_speakers_parser, fix_path, unfix_path, ivector_languages + + classify_speakers_args = classify_speakers_parser.parse_args() + fix_path() + run_classify_speakers(classify_speakers_args, ivector_languages) + unfix_path() diff --git a/montreal_forced_aligner/command_line/create_segments.py b/montreal_forced_aligner/command_line/create_segments.py new file mode 100644 index 00000000..317c4c15 --- /dev/null +++ b/montreal_forced_aligner/command_line/create_segments.py @@ -0,0 +1,122 @@ +import shutil +import os +import time +import multiprocessing as mp +import yaml + +from montreal_forced_aligner import __version__ +from montreal_forced_aligner.corpus.transcribe_corpus import TranscribeCorpus +from montreal_forced_aligner.segmenter import Segmenter +from montreal_forced_aligner.config import TEMP_DIR, segmentation_yaml_to_config, load_basic_segmentation +from montreal_forced_aligner.helper import setup_logger +from montreal_forced_aligner.exceptions import ArgumentError + + +def create_segments(args, unknown_args=None): + command = 'create_segments' + all_begin = time.time() + if not args.temp_directory: + temp_dir = TEMP_DIR + else: + temp_dir = os.path.expanduser(args.temp_directory) + corpus_name = os.path.basename(args.corpus_directory) + if corpus_name == '': + args.corpus_directory = os.path.dirname(args.corpus_directory) + corpus_name = os.path.basename(args.corpus_directory) + data_directory = os.path.join(temp_dir, corpus_name) + conf_path = os.path.join(data_directory, 'config.yml') + logger = setup_logger(command, data_directory) + if args.config_path: + segmentation_config = segmentation_yaml_to_config(args.config_path) + else: + segmentation_config = load_basic_segmentation() + if unknown_args: + 
segmentation_config.update_from_args(unknown_args) + if getattr(args, 'clean', False) and os.path.exists(data_directory): + logger.info('Cleaning old directory!') + shutil.rmtree(data_directory, ignore_errors=True) + if os.path.exists(conf_path): + with open(conf_path, 'r') as f: + conf = yaml.load(f, Loader=yaml.SafeLoader) + else: + conf = {'dirty': False, + 'begin': time.time(), + 'version': __version__, + 'type': command, + 'corpus_directory': args.corpus_directory} + if conf['dirty'] or conf['type'] != command \ + or conf['corpus_directory'] != args.corpus_directory \ + or conf['version'] != __version__: + logger.warning( + 'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no ' + 'weird behavior for previous versions of the temporary directory.') + if conf['dirty']: + logger.debug('Previous run ended in an error (maybe ctrl-c?)') + if conf['type'] != command: + logger.debug('Previous run was a different subcommand than {} (was {})'.format(command, conf['type'])) + if conf['corpus_directory'] != args.corpus_directory: + logger.debug('Previous run used source directory ' + 'path {} (new run: {})'.format(conf['corpus_directory'], args.corpus_directory)) + if conf['version'] != __version__: + logger.debug('Previous run was on {} version (new run: {})'.format(conf['version'], __version__)) + + os.makedirs(data_directory, exist_ok=True) + os.makedirs(args.output_directory, exist_ok=True) + try: + corpus = TranscribeCorpus(args.corpus_directory, data_directory, + num_jobs=args.num_jobs, logger=logger, use_mp=segmentation_config.use_mp) + + begin = time.time() + a = Segmenter(corpus, segmentation_config, + temp_directory=data_directory, + debug=getattr(args, 'debug', False), logger=logger) + logger.debug('Setup segmenter in {} seconds'.format(time.time() - begin)) + a.verbose = args.verbose + + begin = time.time() + a.segment() + logger.debug('Performed segmentation in {} seconds'.format(time.time() - begin)) + + begin 
= time.time() + a.export_segments(args.output_directory) + logger.debug('Exported segmentation in {} seconds'.format(time.time() - begin)) + logger.info('Done!') + logger.debug('Done! Everything took {} seconds'.format(time.time() - all_begin)) + except Exception as _: + conf['dirty'] = True + raise + finally: + handlers = logger.handlers[:] + for handler in handlers: + handler.close() + logger.removeHandler(handler) + with open(conf_path, 'w') as f: + yaml.dump(conf, f) + + +def validate_args(args): + if not os.path.exists(args.corpus_directory): + raise ArgumentError('Could not find the corpus directory {}.'.format(args.corpus_directory)) + if not os.path.isdir(args.corpus_directory): + raise ArgumentError('The specified corpus directory ({}) is not a directory.'.format(args.corpus_directory)) + + if args.corpus_directory == args.output_directory: + raise ArgumentError('Corpus directory and output directory cannot be the same folder.') + + +def run_create_segments(args, unknown=None): + args.output_directory = args.output_directory.rstrip('/').rstrip('\\') + args.corpus_directory = args.corpus_directory.rstrip('/').rstrip('\\') + + validate_args(args) + create_segments(args, unknown) + + +if __name__ == '__main__': # pragma: no cover + mp.freeze_support() + from montreal_forced_aligner.command_line.mfa import create_segments_parser, fix_path, unfix_path + + create_segments_args, unknown = create_segments_parser.parse_known_args() + fix_path() + run_create_segments(create_segments_args, unknown) + unfix_path() diff --git a/montreal_forced_aligner/command_line/download.py b/montreal_forced_aligner/command_line/download.py index 99b3d2d2..83b4d56e 100644 --- a/montreal_forced_aligner/command_line/download.py +++ b/montreal_forced_aligner/command_line/download.py @@ -1,8 +1,9 @@ import requests from montreal_forced_aligner.exceptions import ArgumentError -from montreal_forced_aligner.models import G2PModel, AcousticModel -from montreal_forced_aligner.utils import 
get_pretrained_acoustic_path, get_pretrained_g2p_path, get_dictionary_path +from montreal_forced_aligner.models import G2PModel, AcousticModel, IvectorExtractor +from montreal_forced_aligner.utils import get_pretrained_acoustic_path, get_pretrained_g2p_path, get_dictionary_path, \ + get_pretrained_ivector_path def tqdm_hook(t): @@ -41,6 +42,9 @@ def download_model(model_type, language): elif model_type == 'dictionary': extension = '.dict' out_path = get_dictionary_path(language) + elif model_type == 'ivector': + extension = IvectorExtractor.extension + out_path = get_pretrained_ivector_path(language) else: raise NotImplementedError url = 'https://github.com/MontrealCorpusTools/mfa-models/raw/master/{}/{}{}'.format(model_type, language, extension) diff --git a/montreal_forced_aligner/command_line/g2p.py b/montreal_forced_aligner/command_line/g2p.py index 14b45e48..8b9ae8b8 100644 --- a/montreal_forced_aligner/command_line/g2p.py +++ b/montreal_forced_aligner/command_line/g2p.py @@ -30,14 +30,14 @@ def generate_dictionary(args): corpus_name = os.path.basename(args.input_path) data_directory = os.path.join(temp_dir, corpus_name) - corpus = AlignableCorpus(input_dir, data_directory) + corpus = AlignableCorpus(input_dir, data_directory, num_jobs=args.num_jobs, use_mp=(not args.disable_mp)) word_set = get_word_set(corpus, args.include_bracketed) else: - word_set = set() + word_set = [] with open(args.input_path, 'r', encoding='utf8') as f: for line in f: - word_set.update(line.strip().split()) + word_set.extend(line.strip().split()) if not args.include_bracketed: word_set = [x for x in word_set if not check_bracketed(x)] @@ -49,7 +49,7 @@ def generate_dictionary(args): model.clean_up() else: with open(args.output_path, "w", encoding='utf8') as f: - for word in sorted(word_set): + for word in word_set: pronunciation = list(word) f.write('{} {}\n'.format(word, ' '.join(pronunciation))) diff --git a/montreal_forced_aligner/command_line/mfa.py 
b/montreal_forced_aligner/command_line/mfa.py index ad6b3b01..8a093bce 100644 --- a/montreal_forced_aligner/command_line/mfa.py +++ b/montreal_forced_aligner/command_line/mfa.py @@ -4,7 +4,7 @@ import multiprocessing as mp from montreal_forced_aligner.utils import get_available_acoustic_languages, get_available_g2p_languages, \ - get_available_dict_languages, get_available_lm_languages + get_available_dict_languages, get_available_lm_languages, get_available_ivector_languages from montreal_forced_aligner.command_line.align import run_align_corpus from montreal_forced_aligner.command_line.train_and_align import run_train_corpus from montreal_forced_aligner.command_line.g2p import run_g2p @@ -14,8 +14,10 @@ from montreal_forced_aligner.command_line.train_lm import run_train_lm from montreal_forced_aligner.command_line.thirdparty import run_thirdparty from montreal_forced_aligner.command_line.train_ivector_extractor import run_train_ivector_extractor +from montreal_forced_aligner.command_line.classify_speakers import run_classify_speakers from montreal_forced_aligner.command_line.transcribe import run_transcribe_corpus from montreal_forced_aligner.command_line.train_dictionary import run_train_dictionary +from montreal_forced_aligner.command_line.create_segments import run_create_segments def fix_path(): @@ -40,6 +42,7 @@ def unfix_path(): acoustic_languages = get_available_acoustic_languages() +ivector_languages = get_available_ivector_languages() lm_languages = get_available_lm_languages() g2p_languages = get_available_g2p_languages() dict_languages = get_available_dict_languages() @@ -191,12 +194,13 @@ def unfix_path(): train_dictionary_parser.add_argument('-d', '--debug', help="Output debug messages about alignment", action='store_true') train_ivector_parser = subparsers.add_parser('train_ivector') -train_ivector_parser.add_argument('corpus_directory', help='Full path to the source directory to align') -train_ivector_parser.add_argument('dictionary_path', 
help='Full path to the pronunciation dictionary to use', - default='') - +train_ivector_parser.add_argument('corpus_directory', help='Full path to the source directory to ' + 'train the ivector extractor') +train_ivector_parser.add_argument('dictionary_path', help='Full path to the pronunciation dictionary to use') +train_ivector_parser.add_argument('acoustic_model_path', type=str, default='', + help='Full path to acoustic model for alignment') train_ivector_parser.add_argument('output_model_path', type=str, default='', - help='Full path to save resulting ivector_extractor') + help='Full path to save resulting ivector extractor') train_ivector_parser.add_argument('-s', '--speaker_characters', type=str, default='0', help='Number of characters of filenames to use for determining speaker, ' 'default is to use directory names') @@ -210,6 +214,48 @@ def unfix_path(): train_ivector_parser.add_argument('--config_path', type=str, default='', help='Path to config file to use for training') +classify_speakers_parser = subparsers.add_parser('classify_speakers') +classify_speakers_parser.add_argument('corpus_directory', help='Full path to the source directory to ' + 'run speaker classification') +classify_speakers_parser.add_argument('ivector_extractor_path', type=str, default='', + help='Full path to ivector extractor model') +classify_speakers_parser.add_argument('output_directory', + help="Full path to output directory, will be created if it doesn't exist") + +classify_speakers_parser.add_argument('-s', '--num_speakers', type=int, default=0, + help='Number of speakers if known') +classify_speakers_parser.add_argument('--cluster', help="Using clustering instead of classification", action='store_true') +classify_speakers_parser.add_argument('-t', '--temp_directory', type=str, default='', + help='Temporary directory root to use for aligning, default is ~/Documents/MFA') +classify_speakers_parser.add_argument('-j', '--num_jobs', type=int, default=3, + help='Number of cores to 
use while performing speaker classification') +classify_speakers_parser.add_argument('-v', '--verbose', help="Output debug messages about speaker classification", + action='store_true') +classify_speakers_parser.add_argument('-c', '--clean', help="Remove files from previous runs", action='store_true') +classify_speakers_parser.add_argument('-d', '--debug', help="Debug the aligner", action='store_true') +classify_speakers_parser.add_argument('--disable_mp', help="Disable multiprocessing (not recommended)", + action='store_true') +classify_speakers_parser.add_argument('--config_path', type=str, default='', + help='Path to config file to use for ivector extraction') + +create_segments_parser = subparsers.add_parser('create_segments') +create_segments_parser.add_argument('corpus_directory', help='Full path to the source directory to ' + 'run VAD segmentation') +create_segments_parser.add_argument('output_directory', + help="Full path to output directory, will be created if it doesn't exist") + +create_segments_parser.add_argument('-t', '--temp_directory', type=str, default='', + help='Temporary directory root to use for segmentation, default is ' + '~/Documents/MFA') +create_segments_parser.add_argument('-j', '--num_jobs', type=int, default=3, + help='Number of cores to use while creating segments') +create_segments_parser.add_argument('-v', '--verbose', help="Output debug messages about segmentation", + action='store_true') +create_segments_parser.add_argument('-c', '--clean', help="Remove files from previous runs", action='store_true') +create_segments_parser.add_argument('-d', '--debug', help="Debug the aligner", action='store_true') +create_segments_parser.add_argument('--config_path', type=str, default='', + help='Path to config file to use for segmentation') + transcribe_parser = subparsers.add_parser('transcribe') transcribe_parser.add_argument('corpus_directory', help='Full path to the directory to transcribe') transcribe_parser.add_argument('dictionary_path', 
help='Full path to the pronunciation dictionary to use') @@ -249,8 +295,7 @@ def unfix_path(): def main(): mp.freeze_support() - args = parser.parse_args() - + args, unknown = parser.parse_known_args() fix_path() if args.subcommand in ['align', 'train', 'train_ivector']: from montreal_forced_aligner.thirdparty.kaldi import validate_alignment_binaries @@ -287,7 +332,7 @@ def main(): "please use the Windows Subsystem for Linux to use g2p functionality.") sys.exit(1) if args.subcommand == 'align': - run_align_corpus(args, acoustic_languages) + run_align_corpus(args, unknown, acoustic_languages) elif args.subcommand == 'train': run_train_corpus(args) elif args.subcommand == 'g2p': @@ -304,6 +349,8 @@ def main(): run_train_dictionary(args) elif args.subcommand == 'train_ivector': run_train_ivector_extractor(args) + elif args.subcommand == 'classify_speakers': + run_classify_speakers(args) elif args.subcommand == 'annotator': from montreal_forced_aligner.command_line.annotator import run_annotator run_annotator(args) @@ -311,6 +358,8 @@ def main(): run_thirdparty(args) elif args.subcommand == 'transcribe': run_transcribe_corpus(args) + elif args.subcommand == 'create_segments': + run_create_segments(args, unknown) unfix_path() diff --git a/montreal_forced_aligner/command_line/train_and_align.py b/montreal_forced_aligner/command_line/train_and_align.py index f276e8e6..8a748388 100644 --- a/montreal_forced_aligner/command_line/train_and_align.py +++ b/montreal_forced_aligner/command_line/train_and_align.py @@ -10,11 +10,14 @@ from montreal_forced_aligner.aligner import TrainableAligner from montreal_forced_aligner.config import TEMP_DIR, train_yaml_to_config, load_basic_train from montreal_forced_aligner.utils import get_available_dict_languages, get_dictionary_path +from montreal_forced_aligner.helper import setup_logger from montreal_forced_aligner.exceptions import ArgumentError -def align_corpus(args): +def align_corpus(args, unknown_args=None): + command = 
'train_and_align' + all_begin = time.time() if not args.temp_directory: temp_dir = TEMP_DIR else: @@ -24,7 +27,19 @@ def align_corpus(args): args.corpus_directory = os.path.dirname(args.corpus_directory) corpus_name = os.path.basename(args.corpus_directory) data_directory = os.path.join(temp_dir, corpus_name) + logger = setup_logger(command, data_directory) + if args.config_path: + train_config, align_config = train_yaml_to_config(args.config_path) + else: + train_config, align_config = load_basic_train() + if unknown_args: + align_config.update_from_args(unknown_args) conf_path = os.path.join(data_directory, 'config.yml') + if args.debug: + logger.warning('Running in DEBUG mode, may have impact on performance and disk usage.') + if getattr(args, 'clean', False) and os.path.exists(data_directory): + logger.info('Cleaning old directory!') + shutil.rmtree(data_directory, ignore_errors=True) if os.path.exists(conf_path): with open(conf_path, 'r') as f: conf = yaml.load(f, Loader=yaml.SafeLoader) @@ -32,47 +47,66 @@ def align_corpus(args): conf = {'dirty': False, 'begin': time.time(), 'version': __version__, - 'type': 'train_and_align', + 'type': command, 'corpus_directory': args.corpus_directory, 'dictionary_path': args.dictionary_path} - if getattr(args, 'clean', False) \ - or conf['dirty'] or conf['type'] != 'train_and_align' \ + if conf['dirty'] or conf['type'] != command \ or conf['corpus_directory'] != args.corpus_directory \ or conf['version'] != __version__ \ or conf['dictionary_path'] != args.dictionary_path: - shutil.rmtree(data_directory, ignore_errors=True) + logger.warning( + 'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no ' + 'weird behavior for previous versions of the temporary directory.') + if conf['dirty']: + logger.debug('Previous run ended in an error (maybe ctrl-c?)') + if conf['type'] != command: + logger.debug('Previous run was a different subcommand than {} (was {})'.format(command, 
conf['type'])) + if conf['corpus_directory'] != args.corpus_directory: + logger.debug('Previous run used source directory ' + 'path {} (new run: {})'.format(conf['corpus_directory'], args.corpus_directory)) + if conf['version'] != __version__: + logger.debug('Previous run was on {} version (new run: {})'.format(conf['version'], __version__)) + if conf['dictionary_path'] != args.dictionary_path: + logger.debug('Previous run used dictionary path {} ' + '(new run: {})'.format(conf['dictionary_path'], args.dictionary_path)) os.makedirs(data_directory, exist_ok=True) os.makedirs(args.output_directory, exist_ok=True) try: corpus = AlignableCorpus(args.corpus_directory, data_directory, speaker_characters=args.speaker_characters, num_jobs=getattr(args, 'num_jobs', 3), - debug=getattr(args, 'debug', False)) + debug=getattr(args, 'debug', False), logger=logger, use_mp=align_config.use_mp) if corpus.issues_check: - print('WARNING: Some issues parsing the corpus were detected. ' - 'Please run the validator to get more information.') - dictionary = Dictionary(args.dictionary_path, data_directory, word_set=corpus.word_set) + logger.warning('Some issues parsing the corpus were detected. 
' + 'Please run the validator to get more information.') + logger.info(corpus.speaker_utterance_info()) + dictionary = Dictionary(args.dictionary_path, data_directory, word_set=corpus.word_set, logger=logger) utt_oov_path = os.path.join(corpus.split_directory(), 'utterance_oovs.txt') if os.path.exists(utt_oov_path): shutil.copy(utt_oov_path, args.output_directory) oov_path = os.path.join(corpus.split_directory(), 'oovs_found.txt') if os.path.exists(oov_path): shutil.copy(oov_path, args.output_directory) - if args.config_path: - train_config, align_config = train_yaml_to_config(args.config_path) - else: - train_config, align_config = load_basic_train() a = TrainableAligner(corpus, dictionary, train_config, align_config, - temp_directory=data_directory) + temp_directory=data_directory, logger=logger, + debug=getattr(args, 'debug', False)) a.verbose = args.verbose + begin = time.time() a.train() + logger.debug('Training took {} seconds'.format(time.time() - begin)) a.export_textgrids(args.output_directory) if args.output_model_path is not None: a.save(args.output_model_path) + logger.info('All done!') + logger.debug('Done! 
Everything took {} seconds'.format(time.time() - all_begin)) except Exception as _: conf['dirty'] = True raise finally: + handlers = logger.handlers[:] + for handler in handlers: + handler.close() + logger.removeHandler(handler) with open(conf_path, 'w') as f: yaml.dump(conf, f) @@ -93,7 +127,7 @@ def validate_args(args, download_dictionaries): raise (ArgumentError('The specified dictionary path ({}) is not a text file.'.format(args.dictionary_path))) -def run_train_corpus(args, download_dictionaries=None): +def run_train_corpus(args, unknown_args=None, download_dictionaries=None): if download_dictionaries is None: download_dictionaries = get_available_dict_languages() try: @@ -106,7 +140,7 @@ def run_train_corpus(args, download_dictionaries=None): args.corpus_directory = args.corpus_directory.rstrip('/').rstrip('\\') validate_args(args, download_dictionaries) - align_corpus(args) + align_corpus(args, unknown_args) if __name__ == '__main__': # pragma: no cover diff --git a/montreal_forced_aligner/command_line/train_dictionary.py b/montreal_forced_aligner/command_line/train_dictionary.py index 7ab8737e..2cc4f2b2 100644 --- a/montreal_forced_aligner/command_line/train_dictionary.py +++ b/montreal_forced_aligner/command_line/train_dictionary.py @@ -27,6 +27,10 @@ def train_dictionary(args): corpus_name = os.path.basename(args.corpus_directory) data_directory = os.path.join(temp_dir, corpus_name) conf_path = os.path.join(data_directory, 'config.yml') + if args.config_path: + align_config = align_yaml_to_config(args.config_path) + else: + align_config = load_basic_align() if os.path.exists(conf_path): with open(conf_path, 'r') as f: conf = yaml.load(f, Loader=yaml.SafeLoader) @@ -48,7 +52,7 @@ def train_dictionary(args): try: corpus = AlignableCorpus(args.corpus_directory, data_directory, speaker_characters=args.speaker_characters, - num_jobs=args.num_jobs) + num_jobs=args.num_jobs, use_mp=align_config.use_mp) if corpus.issues_check: print('WARNING: Some issues parsing 
the corpus were detected. ' 'Please run the validator to get more information.') @@ -58,10 +62,6 @@ def train_dictionary(args): acoustic_model.validate(dictionary) begin = time.time() - if args.config_path: - align_config = align_yaml_to_config(args.config_path) - else: - align_config = load_basic_align() a = PretrainedAligner(corpus, dictionary, acoustic_model, align_config, temp_directory=data_directory, debug=getattr(args, 'debug', False)) diff --git a/montreal_forced_aligner/command_line/train_ivector_extractor.py b/montreal_forced_aligner/command_line/train_ivector_extractor.py index 16d22882..e9f6287e 100644 --- a/montreal_forced_aligner/command_line/train_ivector_extractor.py +++ b/montreal_forced_aligner/command_line/train_ivector_extractor.py @@ -5,16 +5,21 @@ import time from montreal_forced_aligner import __version__ -from montreal_forced_aligner.corpus.align_corpus import AlignableCorpus +from montreal_forced_aligner.corpus import AlignableCorpus from montreal_forced_aligner.dictionary import Dictionary -from montreal_forced_aligner.aligner import TrainableAligner +from montreal_forced_aligner.aligner import PretrainedAligner from montreal_forced_aligner.config import TEMP_DIR, train_yaml_to_config, load_basic_train_ivector -from montreal_forced_aligner.utils import get_available_dict_languages, get_dictionary_path +from montreal_forced_aligner.helper import setup_logger +from montreal_forced_aligner.utils import get_available_acoustic_languages, get_pretrained_acoustic_path, \ + get_available_dict_languages, get_dictionary_path +from montreal_forced_aligner.models import AcousticModel from montreal_forced_aligner.exceptions import ArgumentError def train_ivector(args): + command = 'train_ivector' + all_begin = time.time() if not args.temp_directory: temp_dir = TEMP_DIR else: @@ -24,57 +29,94 @@ def train_ivector(args): args.corpus_directory = os.path.dirname(args.corpus_directory) corpus_name = os.path.basename(args.corpus_directory) data_directory = 
os.path.join(temp_dir, corpus_name) + logger = setup_logger(command, data_directory) + if args.config_path: + train_config, align_config = train_yaml_to_config(args.config_path) + else: + train_config, align_config = load_basic_train_ivector() conf_path = os.path.join(data_directory, 'config.yml') + if getattr(args, 'clean', False) and os.path.exists(data_directory): + logger.info('Cleaning old directory!') + shutil.rmtree(data_directory, ignore_errors=True) + if os.path.exists(conf_path): with open(conf_path, 'r') as f: conf = yaml.load(f, Loader=yaml.SafeLoader) else: conf = {'dirty': False, - 'begin': time.time(), + 'begin': all_begin, 'version': __version__, - 'type': 'train_and_align', + 'type': command, 'corpus_directory': args.corpus_directory, - 'dictionary_path': args.dictionary_path} - if getattr(args, 'clean', False) \ - or conf['dirty'] or conf['type'] != 'train_and_align' \ + 'dictionary_path': args.dictionary_path, + 'acoustic_model_path': args.acoustic_model_path, + } + if conf['dirty'] or conf['type'] != command \ or conf['corpus_directory'] != args.corpus_directory \ or conf['version'] != __version__ \ - or conf['dictionary_path'] != args.dictionary_path: - shutil.rmtree(data_directory, ignore_errors=True) + or conf['dictionary_path'] != args.dictionary_path \ + or conf['acoustic_model_path'] != args.acoustic_model_path: + logger.warning( + 'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no ' + 'weird behavior for previous versions of the temporary directory.') + if conf['dirty']: + logger.debug('Previous run ended in an error (maybe ctrl-c?)') + if conf['type'] != command: + logger.debug('Previous run was a different subcommand than {} (was {})'.format(command, conf['type'])) + if conf['corpus_directory'] != args.corpus_directory: + logger.debug('Previous run used source directory ' + 'path {} (new run: {})'.format(conf['corpus_directory'], args.corpus_directory)) + if conf['version'] != 
__version__: + logger.debug('Previous run was on {} version (new run: {})'.format(conf['version'], __version__)) + if conf['dictionary_path'] != args.dictionary_path: + logger.debug('Previous run used dictionary path {} ' + '(new run: {})'.format(conf['dictionary_path'], args.dictionary_path)) + if conf['acoustic_model_path'] != args.acoustic_model_path: + logger.debug('Previous run used acoustic model path {} ' + '(new run: {})'.format(conf['acoustic_model_path'], args.acoustic_model_path)) os.makedirs(data_directory, exist_ok=True) try: - corpus = AlignableCorpus(args.corpus_directory, data_directory, speaker_characters=args.speaker_characters, - num_jobs=getattr(args, 'num_jobs', 3), - debug=getattr(args, 'debug', False)) - if corpus.issues_check: - print('WARNING: Some issues parsing the corpus were detected. ' - 'Please run the validator to get more information.') - dictionary = Dictionary(args.dictionary_path, data_directory, word_set=corpus.word_set) - utt_oov_path = os.path.join(corpus.split_directory(), 'utterance_oovs.txt') - if os.path.exists(utt_oov_path): - shutil.copy(utt_oov_path, args.output_directory) - oov_path = os.path.join(corpus.split_directory(), 'oovs_found.txt') - if os.path.exists(oov_path): - shutil.copy(oov_path, args.output_directory) - if args.config_path: - train_config, align_config = train_yaml_to_config(args.config_path) - else: - train_config, align_config = load_basic_train_ivector() - a = TrainableAligner(corpus, dictionary, train_config, align_config, - temp_directory=data_directory) + begin = time.time() + corpus = AlignableCorpus(args.corpus_directory, data_directory, + speaker_characters=args.speaker_characters, + num_jobs=args.num_jobs, + debug=getattr(args, 'debug', False), logger=logger, use_mp=align_config.use_mp) + acoustic_model = AcousticModel(args.acoustic_model_path) + dictionary = Dictionary(args.dictionary_path, data_directory, word_set=corpus.word_set, logger=logger) + acoustic_model.validate(dictionary) + a = 
PretrainedAligner(corpus, dictionary, acoustic_model, align_config, + temp_directory=data_directory, logger=logger) + logger.debug('Setup pretrained aligner in {} seconds'.format(time.time() - begin)) a.verbose = args.verbose - a.train() - a.save(args.output_model_path) + begin = time.time() + a.align() + logger.debug('Performed alignment in {} seconds'.format(time.time() - begin)) + for identifier, trainer in train_config.items(): + trainer.logger = logger + if identifier != 'ivector': + continue + begin = time.time() + trainer.init_training(identifier, data_directory, corpus, dictionary, a) + trainer.train(call_back=print) + logger.debug('Training took {} seconds'.format(time.time() - begin)) + trainer.save(args.output_model_path) + + logger.info('All done!') + logger.debug('Done! Everything took {} seconds'.format(time.time() - all_begin)) except Exception as e: conf['dirty'] = True raise e finally: + handlers = logger.handlers[:] + for handler in handlers: + handler.close() + logger.removeHandler(handler) with open(conf_path, 'w') as f: yaml.dump(conf, f) -def validate_args(args, download_dictionaries=None): +def validate_args(args, downloaded_acoustic_models, download_dictionaries): if args.config_path and not os.path.exists(args.config_path): raise (ArgumentError('Could not find the config file {}.'.format(args.config_path))) @@ -86,12 +128,25 @@ def validate_args(args, download_dictionaries=None): if args.dictionary_path.lower() in download_dictionaries: args.dictionary_path = get_dictionary_path(args.dictionary_path.lower()) if not os.path.exists(args.dictionary_path): - raise (ArgumentError('Could not find the dictionary file {}'.format(args.dictionary_path))) + raise ArgumentError('Could not find the dictionary file {}'.format(args.dictionary_path)) if not os.path.isfile(args.dictionary_path): - raise (ArgumentError('The specified dictionary path ({}) is not a text file.'.format(args.dictionary_path))) + raise ArgumentError('The specified dictionary path 
({}) is not a text file.'.format(args.dictionary_path)) + + if args.acoustic_model_path.lower() in downloaded_acoustic_models: + args.acoustic_model_path = get_pretrained_acoustic_path(args.acoustic_model_path.lower()) + elif args.acoustic_model_path.lower().endswith(AcousticModel.extension): + if not os.path.exists(args.acoustic_model_path): + raise ArgumentError('The specified model path does not exist: ' + args.acoustic_model_path) + else: + raise ArgumentError( + 'The language \'{}\' is not currently included in the distribution, ' + 'please align via training or specify one of the following language names: {}.'.format( + args.acoustic_model_path.lower(), ', '.join(downloaded_acoustic_models))) -def run_train_ivector_extractor(args, download_dictionaries=None): +def run_train_ivector_extractor(args, downloaded_acoustic_models=None, download_dictionaries=None): + if downloaded_acoustic_models is None: + downloaded_acoustic_models = get_available_acoustic_languages() if download_dictionaries is None: download_dictionaries = get_available_dict_languages() try: @@ -100,15 +155,17 @@ def run_train_ivector_extractor(args, download_dictionaries=None): pass args.corpus_directory = args.corpus_directory.rstrip('/').rstrip('\\') - validate_args(args, download_dictionaries) + validate_args(args, downloaded_acoustic_models, download_dictionaries) train_ivector(args) if __name__ == '__main__': # pragma: no cover mp.freeze_support() - from montreal_forced_aligner.command_line.mfa import train_ivector_parser, fix_path, unfix_path, dict_languages + from montreal_forced_aligner.command_line.mfa import train_ivector_parser, fix_path, unfix_path, acoustic_languages, \ + dict_languages + ivector_args = train_ivector_parser.parse_args() fix_path() - run_train_ivector_extractor(ivector_args, dict_languages) + run_train_ivector_extractor(ivector_args, acoustic_languages, dict_languages) unfix_path() diff --git a/montreal_forced_aligner/command_line/train_lm.py 
b/montreal_forced_aligner/command_line/train_lm.py index 84a64604..a5044846 100644 --- a/montreal_forced_aligner/command_line/train_lm.py +++ b/montreal_forced_aligner/command_line/train_lm.py @@ -11,40 +11,52 @@ from montreal_forced_aligner.lm.trainer import LmTrainer from montreal_forced_aligner.utils import get_available_dict_languages, get_dictionary_path +from montreal_forced_aligner.helper import setup_logger def train_lm(args): + command = 'train_lm' + all_begin = time.time() if not args.temp_directory: temp_dir = TEMP_DIR else: temp_dir = os.path.expanduser(args.temp_directory) - all_begin = time.time() + if args.config_path: + train_config = train_lm_yaml_to_config(args.config_path) + else: + train_config = load_basic_train_lm() corpus_name = os.path.basename(args.source_path) if corpus_name == '': args.source_path = os.path.dirname(args.source_path) corpus_name = os.path.basename(args.source_path) + source = args.source_path + dictionary = None if args.source_path.lower().endswith('.arpa'): - source = args.source_path - dictionary = None corpus_name = os.path.splitext(corpus_name)[0] data_directory = os.path.join(temp_dir, corpus_name) else: data_directory = os.path.join(temp_dir, corpus_name) - source = AlignableCorpus(args.source_path, data_directory, num_jobs=args.num_jobs) + + logger = setup_logger(command, data_directory) + if not args.source_path.lower().endswith('.arpa'): + source = AlignableCorpus(args.source_path, data_directory, num_jobs=args.num_jobs, use_mp=args.num_jobs>1) if args.dictionary_path is not None: dictionary = Dictionary(args.dictionary_path, data_directory) else: dictionary = None - if args.config_path: - train_config = train_lm_yaml_to_config(args.config_path) - else: - train_config = load_basic_train_lm() trainer = LmTrainer(source, train_config, args.output_model_path, dictionary=dictionary, temp_directory=data_directory, supplemental_model_path=args.model_path, supplemental_model_weight=args.model_weight) + begin = 
time.time() trainer.train() + logger.debug('Training took {} seconds'.format(time.time() - begin)) - print('Done! Everything took {} seconds'.format(time.time() - all_begin)) + logger.info('All done!') + logger.debug('Done! Everything took {} seconds'.format(time.time() - all_begin)) + handlers = logger.handlers[:] + for handler in handlers: + handler.close() + logger.removeHandler(handler) def validate_args(args, download_dictionaries=None): diff --git a/montreal_forced_aligner/command_line/transcribe.py b/montreal_forced_aligner/command_line/transcribe.py index 92a09844..28b784b1 100644 --- a/montreal_forced_aligner/command_line/transcribe.py +++ b/montreal_forced_aligner/command_line/transcribe.py @@ -8,7 +8,8 @@ from montreal_forced_aligner.corpus import AlignableCorpus, TranscribeCorpus from montreal_forced_aligner.dictionary import Dictionary from montreal_forced_aligner.transcriber import Transcriber -from montreal_forced_aligner.models import AcousticModel, LanguageModel +from montreal_forced_aligner.models import AcousticModel, LanguageModel, FORMAT +from montreal_forced_aligner.helper import setup_logger from montreal_forced_aligner.config import TEMP_DIR, transcribe_yaml_to_config, load_basic_transcribe, save_config from montreal_forced_aligner.utils import get_available_acoustic_languages, get_pretrained_acoustic_path, \ get_available_lm_languages, get_pretrained_language_model_path, \ @@ -16,22 +17,8 @@ from montreal_forced_aligner.exceptions import ArgumentError -class DummyArgs(object): - def __init__(self): - self.corpus_directory = '' - self.dictionary_path = '' - self.acoustic_model_path = '' - self.speaker_characters = 0 - self.num_jobs = 0 - self.verbose = False - self.clean = True - self.fast = True - self.debug = False - self.temp_directory = None - self.config_path = '' - - def transcribe_corpus(args): + command = 'transcribe' all_begin = time.time() if not args.temp_directory: temp_dir = TEMP_DIR @@ -41,8 +28,15 @@ def 
transcribe_corpus(args): if corpus_name == '': args.corpus_directory = os.path.dirname(args.corpus_directory) corpus_name = os.path.basename(args.corpus_directory) + if args.config_path: + transcribe_config = transcribe_yaml_to_config(args.config_path) + else: + transcribe_config = load_basic_transcribe() data_directory = os.path.join(temp_dir, corpus_name) - print(data_directory, os.path.exists(data_directory)) + logger = setup_logger(command, data_directory) + if getattr(args, 'clean', False) and os.path.exists(data_directory): + logger.info('Cleaning old directory!') + shutil.rmtree(data_directory, ignore_errors=True) os.makedirs(data_directory, exist_ok=True) os.makedirs(args.output_directory, exist_ok=True) os.makedirs(data_directory, exist_ok=True) @@ -54,35 +48,53 @@ def transcribe_corpus(args): conf = {'dirty': False, 'begin': time.time(), 'version': __version__, - 'type': 'align', + 'type': 'transcribe', 'corpus_directory': args.corpus_directory, - 'dictionary_path': args.dictionary_path} - if getattr(args, 'clean', False) \ - or conf['dirty'] or conf['type'] != 'align' \ + 'dictionary_path': args.dictionary_path, + 'acoustic_model_path': args.acoustic_model_path, + 'language_model_path': args.language_model_path, + } + if conf['dirty'] or conf['type'] != command \ or conf['corpus_directory'] != args.corpus_directory \ or conf['version'] != __version__ \ - or conf['dictionary_path'] != args.dictionary_path: - pass # FIXME - # shutil.rmtree(data_directory, ignore_errors=True) + or conf['dictionary_path'] != args.dictionary_path \ + or conf['language_model_path'] != args.language_model_path \ + or conf['acoustic_model_path'] != args.acoustic_model_path: + logger.warning( + 'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no ' + 'weird behavior for previous versions of the temporary directory.') + if conf['dirty']: + logger.debug('Previous run ended in an error (maybe ctrl-c?)') + if conf['type'] != command: 
+ logger.debug('Previous run was a different subcommand than {} (was {})'.format(command, conf['type'])) + if conf['corpus_directory'] != args.corpus_directory: + logger.debug('Previous run used source directory ' + 'path {} (new run: {})'.format(conf['corpus_directory'], args.corpus_directory)) + if conf['version'] != __version__: + logger.debug('Previous run was on {} version (new run: {})'.format(conf['version'], __version__)) + if conf['dictionary_path'] != args.dictionary_path: + logger.debug('Previous run used dictionary path {} ' + '(new run: {})'.format(conf['dictionary_path'], args.dictionary_path)) + if conf['acoustic_model_path'] != args.acoustic_model_path: + logger.debug('Previous run used acoustic model path {} ' + '(new run: {})'.format(conf['acoustic_model_path'], args.acoustic_model_path)) + if conf['language_model_path'] != args.language_model_path: + logger.debug('Previous run used language model path {} ' + '(new run: {})'.format(conf['language_model_path'], args.language_model_path)) try: if args.evaluate: corpus = AlignableCorpus(args.corpus_directory, data_directory, speaker_characters=args.speaker_characters, - num_jobs=args.num_jobs) + num_jobs=args.num_jobs, use_mp=transcribe_config.use_mp) else: corpus = TranscribeCorpus(args.corpus_directory, data_directory, speaker_characters=args.speaker_characters, - num_jobs=args.num_jobs) + num_jobs=args.num_jobs, use_mp=transcribe_config.use_mp) print(corpus.speaker_utterance_info()) - acoustic_model = AcousticModel(args.acoustic_model_path) - language_model = LanguageModel(args.language_model_path) + acoustic_model = AcousticModel(args.acoustic_model_path, root_directory=data_directory) + language_model = LanguageModel(args.language_model_path, root_directory=data_directory) dictionary = Dictionary(args.dictionary_path, data_directory) acoustic_model.validate(dictionary) - - if args.config_path: - transcribe_config = transcribe_yaml_to_config(args.config_path) - else: - transcribe_config = 
load_basic_transcribe() begin = time.time() t = Transcriber(corpus, dictionary, acoustic_model, language_model, transcribe_config, temp_directory=data_directory, @@ -109,6 +121,10 @@ def transcribe_corpus(args): conf['dirty'] = True raise finally: + handlers = logger.handlers[:] + for handler in handlers: + handler.close() + logger.removeHandler(handler) if os.path.exists(data_directory): with open(conf_path, 'w') as f: yaml.dump(conf, f) @@ -142,7 +158,8 @@ def validate_args(args, downloaded_acoustic_models, download_dictionaries, down if args.language_model_path.lower() in downloaded_language_models: args.language_model_path = get_pretrained_language_model_path(args.language_model_path.lower()) - elif args.language_model_path.lower().endswith(LanguageModel.extension): + elif args.language_model_path.lower().endswith(LanguageModel.extension) or \ + args.language_model_path.lower().endswith(FORMAT): if not os.path.exists(args.language_model_path): raise ArgumentError('The specified model path does not exist: ' + args.language_model_path) else: @@ -152,7 +169,8 @@ def validate_args(args, downloaded_acoustic_models, download_dictionaries, down args.language_model_path.lower(), ', '.join(downloaded_language_models))) -def run_transcribe_corpus(args, downloaded_acoustic_models=None, download_dictionaries=None, downloaded_language_models=None): +def run_transcribe_corpus(args, downloaded_acoustic_models=None, download_dictionaries=None, + downloaded_language_models=None): if downloaded_acoustic_models is None: downloaded_acoustic_models = get_available_acoustic_languages() if download_dictionaries is None: diff --git a/montreal_forced_aligner/command_line/validate.py b/montreal_forced_aligner/command_line/validate.py index 08b7bdd9..5180cdc5 100644 --- a/montreal_forced_aligner/command_line/validate.py +++ b/montreal_forced_aligner/command_line/validate.py @@ -1,5 +1,6 @@ import shutil import os +import time import multiprocessing as mp from 
montreal_forced_aligner.corpus.align_corpus import AlignableCorpus @@ -7,10 +8,15 @@ from montreal_forced_aligner.validator import CorpusValidator from montreal_forced_aligner.exceptions import ArgumentError from montreal_forced_aligner.config import TEMP_DIR -from montreal_forced_aligner.utils import get_available_dict_languages, get_dictionary_path +from montreal_forced_aligner.utils import get_available_acoustic_languages, get_pretrained_acoustic_path, \ + get_available_dict_languages, get_dictionary_path +from montreal_forced_aligner.helper import setup_logger +from montreal_forced_aligner.models import AcousticModel def validate_corpus(args): + command = 'validate' + all_begin = time.time() if not args.temp_directory: temp_dir = TEMP_DIR else: @@ -23,18 +29,31 @@ def validate_corpus(args): shutil.rmtree(data_directory, ignore_errors=True) os.makedirs(data_directory, exist_ok=True) + logger = setup_logger(command, data_directory) corpus = AlignableCorpus(args.corpus_directory, data_directory, speaker_characters=args.speaker_characters, - num_jobs=getattr(args, 'num_jobs', 3)) - dictionary = Dictionary(args.dictionary_path, data_directory, word_set=corpus.word_set) + num_jobs=getattr(args, 'num_jobs', 3), logger=logger, use_mp=not args.disable_mp) + dictionary = Dictionary(args.dictionary_path, data_directory, logger=logger) + if args.acoustic_model_path: + acoustic_model = AcousticModel(args.acoustic_model_path) + acoustic_model.validate(dictionary) a = CorpusValidator(corpus, dictionary, temp_directory=data_directory, ignore_acoustics=getattr(args, 'ignore_acoustics', False), - test_transcriptions=getattr(args, 'test_transcriptions', False), use_mp=not args.disable_mp) + test_transcriptions=getattr(args, 'test_transcriptions', False), use_mp=not args.disable_mp, + logger=logger) + begin = time.time() a.validate() + logger.debug('Validation took {} seconds'.format(time.time() - begin)) + logger.info('All done!') + logger.debug('Done! 
Everything took {} seconds'.format(time.time() - all_begin)) + handlers = logger.handlers[:] + for handler in handlers: + handler.close() + logger.removeHandler(handler) -def validate_args(args, download_dictionaries=None): +def validate_args(args, downloaded_acoustic_models=None, download_dictionaries=None): if args.test_transcriptions and args.ignore_acoustics: raise ArgumentError('Cannot test transcriptions without acoustic feature generation.') if not os.path.exists(args.corpus_directory): @@ -49,23 +68,38 @@ def validate_args(args, download_dictionaries=None): if not os.path.isfile(args.dictionary_path): raise (ArgumentError('The specified dictionary path ({}) is not a text file.'.format(args.dictionary_path))) + if args.acoustic_model_path: + if args.acoustic_model_path.lower() in downloaded_acoustic_models: + args.acoustic_model_path = get_pretrained_acoustic_path(args.acoustic_model_path.lower()) + elif args.acoustic_model_path.lower().endswith(AcousticModel.extension): + if not os.path.exists(args.acoustic_model_path): + raise ArgumentError('The specified model path does not exist: ' + args.acoustic_model_path) + else: + raise ArgumentError( + 'The language \'{}\' is not currently included in the distribution, ' + 'please align via training or specify one of the following language names: {}.'.format( + args.acoustic_model_path.lower(), ', '.join(downloaded_acoustic_models))) -def run_validate_corpus(args, download_dictionaries=None): + +def run_validate_corpus(args, downloaded_acoustic_models=None, download_dictionaries=None): + if downloaded_acoustic_models is None: + downloaded_acoustic_models = get_available_acoustic_languages() if download_dictionaries is None: download_dictionaries = get_available_dict_languages() try: args.speaker_characters = int(args.speaker_characters) except ValueError: pass - validate_args(args, download_dictionaries) + validate_args(args, downloaded_acoustic_models, download_dictionaries) validate_corpus(args) if __name__ == 
'__main__': # pragma: no cover mp.freeze_support() - from montreal_forced_aligner.command_line.mfa import validate_parser, fix_path, unfix_path, dict_languages + from montreal_forced_aligner.command_line.mfa import validate_parser, fix_path, unfix_path, acoustic_languages, \ + dict_languages validate_args = validate_parser.parse_args() fix_path() - run_validate_corpus(validate_args, dict_languages) + run_validate_corpus(validate_args, acoustic_languages, dict_languages) unfix_path() diff --git a/montreal_forced_aligner/config/__init__.py b/montreal_forced_aligner/config/__init__.py index eab54bdc..c88dcbbe 100644 --- a/montreal_forced_aligner/config/__init__.py +++ b/montreal_forced_aligner/config/__init__.py @@ -1,278 +1,17 @@ import os -from collections import Counter -import yaml - -from ..exceptions import ConfigError - -from ..trainers import MonophoneTrainer, TriphoneTrainer, LdaTrainer, SatTrainer, IvectorExtractorTrainer - -from ..features.config import FeatureConfig - +from .base_config import BaseConfig, save_config, ConfigError +from .align_config import AlignConfig, load_basic_align, align_yaml_to_config, FeatureConfig +from .speaker_classification_config import SpeakerClassificationConfig, classification_yaml_to_config, load_basic_classification +from .train_config import TrainingConfig, load_basic_train, load_basic_train_ivector, load_test_config, \ + train_yaml_to_config +from .train_lm_config import TrainLMConfig, load_basic_train_lm, train_lm_yaml_to_config +from .transcribe_config import TranscribeConfig, load_basic_transcribe, transcribe_yaml_to_config +from .segmentation_config import SegmentationConfig, segmentation_yaml_to_config, load_basic_segmentation TEMP_DIR = os.path.expanduser('~/Documents/MFA') -class BaseConfig(object): - def update(self, data): - for k, v in data.items(): - if not hasattr(self, k): - raise ConfigError('No field found for key {}'.format(k)) - setattr(self, k, v) - - -class TrainingConfig(BaseConfig): - def 
__init__(self, training_configs): - self.training_configs = training_configs - counts = Counter([x.train_type for x in self.training_configs]) - self.training_identifiers = [] - curs = {x.train_type: 1 for x in self.training_configs} - for t in training_configs: - i = t.train_type - if counts[t.train_type] != 1: - i += str(curs[t.train_type]) - curs[t.train_type] += 1 - self.training_identifiers.append(i) - - def keys(self): - return self.training_identifiers - - def values(self): - return self.training_configs - - def items(self): - return zip(self.training_identifiers, self.training_configs) - - def __getitem__(self, item): - if item not in self.training_identifiers: - raise KeyError('{} not a valid training identifier'.format(item)) - return self.training_configs[self.training_identifiers.index(item)] - - @property - def uses_lda(self): - for k in self.keys(): - if k.startswith('lda'): - return True - return False - - @property - def uses_sat(self): - for k in self.keys(): - if k.startswith('sat'): - return True - return False - - -class AlignConfig(BaseConfig): - def __init__(self, feature_config): - self.transition_scale = 1.0 - self.acoustic_scale = 0.1 - self.self_loop_scale = 0.1 - self.feature_config = feature_config - self.boost_silence = 1.0 - self.beam = 10 - self.retry_beam = 40 - self.data_directory = None # Gets set later - self.use_mp = True - - @property - def feature_file_base_name(self): - return self.feature_config.feature_id - - def update(self, data): - for k, v in data.items(): - if k == 'use_mp': - self.feature_config.use_mp = v - if not hasattr(self, k): - raise ConfigError('No field found for key {}'.format(k)) - setattr(self, k, v) - - -class TranscribeConfig(BaseConfig): - def __init__(self, feature_config): - self.transition_scale = 1.0 - self.acoustic_scale = 0.083333 - self.self_loop_scale = 0.1 - self.feature_config = feature_config - self.silence_weight = 0.01 - self.beam = 10 - self.max_active = 7000 - self.fmllr = False - 
self.fmllr_update_type = 'full' - self.lattice_beam = 6 - self.first_beam = None - self.first_max_active = None - self.max_fmllr_jobs = 12 - self.language_model_weight = 10 - self.word_insertion_penalty = 0.5 - self.data_directory = None # Gets set later - self.use_mp = True - - def params(self): - return { - 'transition_scale': self.transition_scale, - 'acoustic_scale': self.acoustic_scale, - 'self_loop_scale': self.self_loop_scale, - 'silence_weight': self.silence_weight, - 'beam': self.beam, - 'max_active': self.max_active, - 'fmllr': self.fmllr, - 'fmllr_update_type': self.fmllr_update_type, - 'lattice_beam': self.lattice_beam, - 'first_beam': self.first_beam, - 'first_max_active': self.first_max_active, - 'max_fmllr_jobs': self.max_fmllr_jobs, - 'language_model_weight': self.language_model_weight, - 'word_insertion_penalty': self.word_insertion_penalty, - 'use_mp': self.use_mp, - } - - @property - def feature_file_base_name(self): - return self.feature_config.feature_id - - def update(self, data): - for k, v in data.items(): - if k == 'use_mp': - self.feature_config.use_mp = v - if not hasattr(self, k): - raise ConfigError('No field found for key {}'.format(k)) - setattr(self, k, v) - - -class TrainLMConfig(BaseConfig): - def __init__(self): - self.order = 3 - self.method = 'kneser_ney' - self.prune = False - self.count_threshold = 1 - self.prune_thresh_small = 0.0000003 - self.prune_thresh_medium = 0.0000001 - - -def train_yaml_to_config(path): - with open(path, 'r', encoding='utf8') as f: - data = yaml.load(f, Loader=yaml.SafeLoader) - global_params = {} - training = [] - training_params = [] - global_feature_params = {} - for k, v in data.items(): - if k == 'training': - for t in v: - for k2, v2 in t.items(): - feature_config = FeatureConfig() - if k2 == 'monophone': - training.append(MonophoneTrainer(feature_config)) - elif k2 == 'triphone': - training.append(TriphoneTrainer(feature_config)) - elif k2 == 'lda': - training.append(LdaTrainer(feature_config)) 
- elif k2 == 'sat': - training.append(SatTrainer(feature_config)) - elif k2 == 'ivector': - training.append(IvectorExtractorTrainer(feature_config)) - training_params.append(v2) - elif k == 'features': - global_feature_params.update(v) - else: - global_params[k] = v - feature_config = FeatureConfig() - feature_config.update(global_feature_params) - align_config = AlignConfig(feature_config) - align_config.update(global_params) - training_config = None - if training: - for i, t in enumerate(training): - if i == 0 and t.train_type != 'mono': - raise ConfigError('The first round of training must be monophone.') - t.update(global_params) - t.update(training_params[i]) - training_config = TrainingConfig(training) - align_config.feature_config.lda = training_config.uses_lda - if training_config.uses_lda: - align_config.feature_config.set_features_to_use_lda() - align_config.feature_config.fmllr = training_config.uses_sat - if align_config.beam >= align_config.retry_beam: - raise ConfigError('Retry beam must be greater than beam.') - return training_config, align_config - - -def align_yaml_to_config(path): - with open(path, 'r', encoding='utf8') as f: - data = yaml.load(f, Loader=yaml.SafeLoader) - global_params = {} - feature_config = FeatureConfig() - for k, v in data.items(): - if k == 'features': - feature_config.update(v) - else: - global_params[k] = v - align_config = AlignConfig(feature_config) - align_config.update(global_params) - if align_config.beam >= align_config.retry_beam: - raise ConfigError('Retry beam must be greater than beam.') - return align_config - - -def transcribe_yaml_to_config(path): - with open(path, 'r', encoding='utf8') as f: - data = yaml.load(f, Loader=yaml.SafeLoader) - global_params = {} - feature_config = FeatureConfig() - for k, v in data.items(): - if k == 'features': - feature_config.update(v) - else: - global_params[k] = v - config = TranscribeConfig(feature_config) - config.update(global_params) - return config - - -def 
load_basic_align(): - base_dir = os.path.dirname(os.path.abspath(__file__)) - align_config = align_yaml_to_config(os.path.join(base_dir, 'basic_align.yaml')) - return align_config - - -def load_basic_transcribe(): - base_dir = os.path.dirname(os.path.abspath(__file__)) - config = transcribe_yaml_to_config(os.path.join(base_dir, 'basic_transcribe.yaml')) - return config - - -def load_basic_train(): - base_dir = os.path.dirname(os.path.abspath(__file__)) - training_config, align_config = train_yaml_to_config(os.path.join(base_dir, 'basic_train.yaml')) - return training_config, align_config - - -def load_basic_train_ivector(): - base_dir = os.path.dirname(os.path.abspath(__file__)) - training_config, align_config = train_yaml_to_config(os.path.join(base_dir, 'basic_train_ivector.yaml')) - return training_config, align_config - - -def load_test_config(): - base_dir = os.path.dirname(os.path.abspath(__file__)) - training_config, align_config = train_yaml_to_config(os.path.join(base_dir, 'test_config.yaml')) - return training_config, align_config - - -def train_lm_yaml_to_config(path): - with open(path, 'r', encoding='utf8') as f: - data = yaml.load(f, Loader=yaml.SafeLoader) - config = TrainLMConfig() - config.update(data) - return config -def load_basic_train_lm(): - base_dir = os.path.dirname(os.path.abspath(__file__)) - training_config = train_lm_yaml_to_config(os.path.join(base_dir, 'basic_train_lm.yaml')) - return training_config -def save_config(config, path): - with open(path, 'w', encoding='utf8') as f: - yaml.dump(config.params(), f) diff --git a/montreal_forced_aligner/config/align_config.py b/montreal_forced_aligner/config/align_config.py new file mode 100644 index 00000000..6ba3191f --- /dev/null +++ b/montreal_forced_aligner/config/align_config.py @@ -0,0 +1,66 @@ +import os +import yaml +from .base_config import BaseConfig, ConfigError +from ..features.config import FeatureConfig + + +class AlignConfig(BaseConfig): + def __init__(self, feature_config): + 
self.transition_scale = 1.0 + self.acoustic_scale = 0.1 + self.self_loop_scale = 0.1 + self.feature_config = feature_config + self.boost_silence = 1.0 + self.beam = 10 + self.retry_beam = 40 + self.data_directory = None # Gets set later + self.use_mp = True + + @property + def feature_file_base_name(self): + return self.feature_config.feature_id + + @property + def align_options(self): + return {'transition_scale': self.transition_scale, + 'acoustic_scale': self.acoustic_scale, + 'self_loop_scale': self.self_loop_scale, + 'beam': self.beam, + 'retry_beam': self.retry_beam, + } + + def update(self, data): + for k, v in data.items(): + if k == 'use_mp': + self.feature_config.use_mp = v + if not hasattr(self, k): + raise ConfigError('No field found for key {}'.format(k)) + setattr(self, k, v) + + def update_from_args(self, args): + super(AlignConfig, self).update_from_args(args) + if self.retry_beam <= self.beam: + self.retry_beam = self.beam * 4 + + +def align_yaml_to_config(path): + with open(path, 'r', encoding='utf8') as f: + data = yaml.load(f, Loader=yaml.SafeLoader) + global_params = {} + feature_config = FeatureConfig() + for k, v in data.items(): + if k == 'features': + feature_config.update(v) + else: + global_params[k] = v + align_config = AlignConfig(feature_config) + align_config.update(global_params) + if align_config.beam >= align_config.retry_beam: + raise ConfigError('Retry beam must be greater than beam.') + return align_config + + +def load_basic_align(): + base_dir = os.path.dirname(os.path.abspath(__file__)) + align_config = align_yaml_to_config(os.path.join(base_dir, 'basic_align.yaml')) + return align_config \ No newline at end of file diff --git a/montreal_forced_aligner/config/base_config.py b/montreal_forced_aligner/config/base_config.py new file mode 100644 index 00000000..5f7faa49 --- /dev/null +++ b/montreal_forced_aligner/config/base_config.py @@ -0,0 +1,31 @@ +import yaml +from ..exceptions import ConfigError + + +class 
BaseConfig(object): + def update(self, data): + for k, v in data.items(): + if not hasattr(self, k): + raise ConfigError('No field found for key {}'.format(k)) + setattr(self, k, v) + + def update_from_args(self, args): + for i, a in enumerate(args): + if not a.startswith('--'): + continue + name = a.replace('--', '') + try: + original_value = getattr(self, name) + except AttributeError: + continue + if not isinstance(original_value, (bool, int, float, str)): + continue + try: + val = type(original_value)(args[i+1]) + except (IndexError, ValueError): + continue + setattr(self, name, val) + +def save_config(config, path): + with open(path, 'w', encoding='utf8') as f: + yaml.dump(config.params(), f) diff --git a/montreal_forced_aligner/config/basic_classification.yaml b/montreal_forced_aligner/config/basic_classification.yaml new file mode 100644 index 00000000..e69de29b diff --git a/montreal_forced_aligner/config/basic_segmentation.yaml b/montreal_forced_aligner/config/basic_segmentation.yaml new file mode 100644 index 00000000..748c08b3 --- /dev/null +++ b/montreal_forced_aligner/config/basic_segmentation.yaml @@ -0,0 +1,4 @@ +energy_threshold: 5.5 +energy_mean_scale: 0.5 +max_segment_length: 30 +min_pause_duration: 0.05 \ No newline at end of file diff --git a/montreal_forced_aligner/config/basic_train.yaml b/montreal_forced_aligner/config/basic_train.yaml index ec79d08f..1325768b 100644 --- a/montreal_forced_aligner/config/basic_train.yaml +++ b/montreal_forced_aligner/config/basic_train.yaml @@ -36,19 +36,13 @@ training: max_gaussians: 15000 power: 0.2 silence_weight: 0.0 - fmllr_update_type: "diag" + fmllr_update_type: "full" subset: 10000 - features: - lda: true - sat: num_leaves: 4200 max_gaussians: 40000 power: 0.2 silence_weight: 0.0 - fmllr_update_type: "diag" - subset: 30000 - features: - lda: true - fmllr: true + fmllr_update_type: "full" diff --git a/montreal_forced_aligner/config/basic_train_ivector.yaml 
b/montreal_forced_aligner/config/basic_train_ivector.yaml index 36ec8d00..e6da6f4e 100644 --- a/montreal_forced_aligner/config/basic_train_ivector.yaml +++ b/montreal_forced_aligner/config/basic_train_ivector.yaml @@ -1,44 +1,12 @@ -beam: 10 -retry_beam: 400 - features: type: "mfcc" - use_energy: false + use_energy: true frame_shift: 10 - pitch: false training: - - monophone: - num_iterations: 40 - max_gaussians: 1000 - subset: 2000 - boost_silence: 1.25 - - - triphone: - num_iterations: 35 - num_leaves: 2000 - max_gaussians: 10000 - cluster_threshold: -1 - subset: 5000 - boost_silence: 1.25 - power: 0.25 - - - lda: - num_leaves: 2500 - max_gaussians: 15000 - subset: 10000 - num_iterations: 35 - features: - splice_left_context: 3 - splice_right_context: 3 - - ivector: num_iterations: 10 gaussian_min_count: 2 silence_weight: 0.0 posterior_scale: 0.1 - max_count: 100 - features: - lda: true - splice_left_context: 3 - splice_right_context: 3 \ No newline at end of file + max_count: 100 \ No newline at end of file diff --git a/montreal_forced_aligner/config/segmentation_config.py b/montreal_forced_aligner/config/segmentation_config.py new file mode 100644 index 00000000..7ed22fb0 --- /dev/null +++ b/montreal_forced_aligner/config/segmentation_config.py @@ -0,0 +1,44 @@ +import yaml +import os +from .base_config import BaseConfig, ConfigError +from ..features.config import FeatureConfig + + +class SegmentationConfig(BaseConfig): + def __init__(self, feature_config): + self.use_mp = True + self.energy_threshold = 5.5 + self.energy_mean_scale = 0.5 + self.max_segment_length = 30 + self.min_pause_duration = 0.05 + self.feature_config = feature_config + self.feature_config.use_energy = True + + def update(self, data): + for k, v in data.items(): + if k == 'use_mp': + self.feature_config.use_mp = v + if not hasattr(self, k): + raise ConfigError('No field found for key {}'.format(k)) + setattr(self, k, v) + + +def segmentation_yaml_to_config(path): + with open(path, 'r', 
encoding='utf8') as f: + data = yaml.load(f, Loader=yaml.SafeLoader) + global_params = {} + feature_config = FeatureConfig() + for k, v in data.items(): + if k == 'features': + feature_config.update(v) + else: + global_params[k] = v + segmentation_config = SegmentationConfig(feature_config) + segmentation_config.update(global_params) + return segmentation_config + + +def load_basic_segmentation(): + base_dir = os.path.dirname(os.path.abspath(__file__)) + segmentation_config = segmentation_yaml_to_config(os.path.join(base_dir, 'basic_segmentation.yaml')) + return segmentation_config \ No newline at end of file diff --git a/montreal_forced_aligner/config/speaker_classification_config.py b/montreal_forced_aligner/config/speaker_classification_config.py new file mode 100644 index 00000000..e2f16c7a --- /dev/null +++ b/montreal_forced_aligner/config/speaker_classification_config.py @@ -0,0 +1,30 @@ +import yaml +import os +from .base_config import BaseConfig, ConfigError + + +class SpeakerClassificationConfig(BaseConfig): + def __init__(self): + self.use_mp = True + self.pca_dimension = -1 + self.target_energy = 0.1 + self.cluster_threshold = 0.5 + self.max_speaker_fraction = 1.0 + self.first_pass_max_utterances = 32767 + self.rttm_channel = 0 + self.read_costs = False + + +def classification_yaml_to_config(path): + with open(path, 'r', encoding='utf8') as f: + data = yaml.load(f, Loader=yaml.SafeLoader) + classification_config = SpeakerClassificationConfig() + if data: + classification_config.update(data) + return classification_config + + +def load_basic_classification(): + base_dir = os.path.dirname(os.path.abspath(__file__)) + classification_config = classification_yaml_to_config(os.path.join(base_dir, 'basic_classification.yaml')) + return classification_config diff --git a/montreal_forced_aligner/config/train_config.py b/montreal_forced_aligner/config/train_config.py new file mode 100644 index 00000000..463a328d --- /dev/null +++ 
b/montreal_forced_aligner/config/train_config.py @@ -0,0 +1,119 @@ + +import os +import yaml +from .base_config import BaseConfig, ConfigError +from ..features.config import FeatureConfig +from collections import Counter + +from ..trainers import MonophoneTrainer, TriphoneTrainer, LdaTrainer, SatTrainer, IvectorExtractorTrainer + +from .align_config import AlignConfig + + +class TrainingConfig(BaseConfig): + def __init__(self, training_configs): + self.training_configs = training_configs + counts = Counter([x.train_type for x in self.training_configs]) + self.training_identifiers = [] + curs = {x.train_type: 1 for x in self.training_configs} + for t in training_configs: + i = t.train_type + if counts[t.train_type] != 1: + i += str(curs[t.train_type]) + curs[t.train_type] += 1 + self.training_identifiers.append(i) + + def keys(self): + return self.training_identifiers + + def values(self): + return self.training_configs + + def items(self): + return zip(self.training_identifiers, self.training_configs) + + def __getitem__(self, item): + if item not in self.training_identifiers: + raise KeyError('{} not a valid training identifier'.format(item)) + return self.training_configs[self.training_identifiers.index(item)] + + @property + def uses_lda(self): + for k in self.keys(): + if k.startswith('lda'): + return True + return False + + @property + def uses_sat(self): + for k in self.keys(): + if k.startswith('sat'): + return True + return False + + +def train_yaml_to_config(path): + with open(path, 'r', encoding='utf8') as f: + data = yaml.load(f, Loader=yaml.SafeLoader) + global_params = {} + training = [] + training_params = [] + global_feature_params = {} + for k, v in data.items(): + if k == 'training': + for t in v: + for k2, v2 in t.items(): + feature_config = FeatureConfig() + if k2 == 'monophone': + training.append(MonophoneTrainer(feature_config)) + elif k2 == 'triphone': + training.append(TriphoneTrainer(feature_config)) + elif k2 == 'lda': + 
training.append(LdaTrainer(feature_config)) + elif k2 == 'sat': + training.append(SatTrainer(feature_config)) + elif k2 == 'ivector': + training.append(IvectorExtractorTrainer(feature_config)) + training_params.append(v2) + elif k == 'features': + global_feature_params.update(v) + else: + global_params[k] = v + feature_config = FeatureConfig() + feature_config.update(global_feature_params) + align_config = AlignConfig(feature_config) + align_config.update(global_params) + training_config = None + if training: + for i, t in enumerate(training): + if i == 0 and t.train_type not in ['mono', 'ivector']: + raise ConfigError('The first round of training must be monophone.') + t.update(global_params) + t.update(training_params[i]) + t.feature_config.update(global_feature_params) + training_config = TrainingConfig(training) + align_config.feature_config.lda = training_config.uses_lda + if training_config.uses_lda: + align_config.feature_config.set_features_to_use_lda() + align_config.feature_config.fmllr = training_config.uses_sat + if align_config.beam >= align_config.retry_beam: + raise ConfigError('Retry beam must be greater than beam.') + return training_config, align_config + + +def load_basic_train(): + base_dir = os.path.dirname(os.path.abspath(__file__)) + training_config, align_config = train_yaml_to_config(os.path.join(base_dir, 'basic_train.yaml')) + return training_config, align_config + + +def load_basic_train_ivector(): + base_dir = os.path.dirname(os.path.abspath(__file__)) + training_config, align_config = train_yaml_to_config(os.path.join(base_dir, 'basic_train_ivector.yaml')) + return training_config, align_config + + +def load_test_config(): + base_dir = os.path.dirname(os.path.abspath(__file__)) + training_config, align_config = train_yaml_to_config(os.path.join(base_dir, 'test_config.yaml')) + return training_config, align_config \ No newline at end of file diff --git a/montreal_forced_aligner/config/train_lm_config.py 
b/montreal_forced_aligner/config/train_lm_config.py new file mode 100644 index 00000000..ac4246db --- /dev/null +++ b/montreal_forced_aligner/config/train_lm_config.py @@ -0,0 +1,27 @@ +import os +import yaml +from .base_config import BaseConfig + + +class TrainLMConfig(BaseConfig): + def __init__(self): + self.order = 3 + self.method = 'kneser_ney' + self.prune = False + self.count_threshold = 1 + self.prune_thresh_small = 0.0000003 + self.prune_thresh_medium = 0.0000001 + + +def train_lm_yaml_to_config(path): + with open(path, 'r', encoding='utf8') as f: + data = yaml.load(f, Loader=yaml.SafeLoader) + config = TrainLMConfig() + config.update(data) + return config + + +def load_basic_train_lm(): + base_dir = os.path.dirname(os.path.abspath(__file__)) + training_config = train_lm_yaml_to_config(os.path.join(base_dir, 'basic_train_lm.yaml')) + return training_config diff --git a/montreal_forced_aligner/config/transcribe_config.py b/montreal_forced_aligner/config/transcribe_config.py new file mode 100644 index 00000000..3f6a82df --- /dev/null +++ b/montreal_forced_aligner/config/transcribe_config.py @@ -0,0 +1,77 @@ +import os +import yaml +from .base_config import BaseConfig, ConfigError +from ..features.config import FeatureConfig + + +class TranscribeConfig(BaseConfig): + def __init__(self, feature_config): + self.transition_scale = 1.0 + self.acoustic_scale = 0.083333 + self.self_loop_scale = 0.1 + self.feature_config = feature_config + self.silence_weight = 0.01 + self.beam = 10 + self.max_active = 7000 + self.fmllr = False + self.fmllr_update_type = 'full' + self.lattice_beam = 6 + self.first_beam = None + self.first_max_active = None + self.max_fmllr_jobs = 12 + self.language_model_weight = 10 + self.word_insertion_penalty = 0.5 + self.data_directory = None # Gets set later + self.use_mp = True + + def params(self): + return { + 'transition_scale': self.transition_scale, + 'acoustic_scale': self.acoustic_scale, + 'self_loop_scale': self.self_loop_scale, + 
'silence_weight': self.silence_weight, + 'beam': self.beam, + 'max_active': self.max_active, + 'fmllr': self.fmllr, + 'fmllr_update_type': self.fmllr_update_type, + 'lattice_beam': self.lattice_beam, + 'first_beam': self.first_beam, + 'first_max_active': self.first_max_active, + 'max_fmllr_jobs': self.max_fmllr_jobs, + 'language_model_weight': self.language_model_weight, + 'word_insertion_penalty': self.word_insertion_penalty, + 'use_mp': self.use_mp, + } + + @property + def feature_file_base_name(self): + return self.feature_config.feature_id + + def update(self, data): + for k, v in data.items(): + if k == 'use_mp': + self.feature_config.use_mp = v + if not hasattr(self, k): + raise ConfigError('No field found for key {}'.format(k)) + setattr(self, k, v) + + +def transcribe_yaml_to_config(path): + with open(path, 'r', encoding='utf8') as f: + data = yaml.load(f, Loader=yaml.SafeLoader) + global_params = {} + feature_config = FeatureConfig() + for k, v in data.items(): + if k == 'features': + feature_config.update(v) + else: + global_params[k] = v + config = TranscribeConfig(feature_config) + config.update(global_params) + return config + + +def load_basic_transcribe(): + base_dir = os.path.dirname(os.path.abspath(__file__)) + config = transcribe_yaml_to_config(os.path.join(base_dir, 'basic_transcribe.yaml')) + return config \ No newline at end of file diff --git a/montreal_forced_aligner/corpus/align_corpus.py b/montreal_forced_aligner/corpus/align_corpus.py index 79796efe..e25ffe00 100644 --- a/montreal_forced_aligner/corpus/align_corpus.py +++ b/montreal_forced_aligner/corpus/align_corpus.py @@ -2,21 +2,20 @@ import sys import traceback import random +import time +import re from collections import Counter from textgrid import TextGrid, IntervalTier +from ..helper import load_text, output_mapping, save_groups, filter_scp, load_scp -from ..dictionary import sanitize -from ..helper import load_text, output_mapping, save_groups, filter_scp +from ..exceptions import 
SampleRateError, CorpusError, WavReadError, SampleRateMismatchError, \ + BitDepthError, TextParseError, TextGridParseError -from ..exceptions import SampleRateError, CorpusError - -from .base import BaseCorpus, get_sample_rate, get_n_channels, get_wav_duration, extract_temp_channels, get_bit_depth, find_ext - - -def parse_transcription(text): - words = [sanitize(x) for x in text.split()] - words = [x for x in words if x not in ['', '-', "'"]] - return words +from .base import BaseCorpus, extract_temp_channels, find_exts, get_wav_info +import multiprocessing as mp +from queue import Empty +from ..multiprocessing.helper import Stopped +from ..multiprocessing.corpus import CorpusProcessWorker, parse_transcription, parse_lab_file, parse_textgrid_file class AlignableCorpus(BaseCorpus): @@ -53,13 +52,12 @@ class AlignableCorpus(BaseCorpus): def __init__(self, directory, output_directory, speaker_characters=0, - num_jobs=3, debug=False): + num_jobs=3, debug=False, logger=None, use_mp=True): super(AlignableCorpus, self).__init__(directory, output_directory, speaker_characters, - num_jobs, debug) + num_jobs, debug, logger, use_mp) # Set up mapping dictionaries self.utt_text_file_mapping = {} - self.text_mapping = {} self.word_counts = Counter() self.utterance_oovs = {} self.no_transcription_files = [] @@ -67,155 +65,271 @@ def __init__(self, directory, output_directory, self.transcriptions_without_wavs = [] self.tg_count = 0 self.lab_count = 0 + + loaded = self._load_from_temp() + if not loaded: + if self.use_mp: + self._load_from_source_mp() + else: + self._load_from_source() + self.check_warnings() + self.find_best_groupings() + + def _load_from_temp(self): + begin_time = time.time() + feat_path = os.path.join(self.output_directory, 'feats.scp') + if not os.path.exists(feat_path): + return False + cmvn_path = os.path.join(self.output_directory, 'cmvn.scp') + if not os.path.exists(cmvn_path): + return False + utt2spk_path = os.path.join(self.output_directory, 'utt2spk') + 
if not os.path.exists(utt2spk_path): + return False + spk2utt_path = os.path.join(self.output_directory, 'spk2utt') + if not os.path.exists(spk2utt_path): + return False + text_path = os.path.join(self.output_directory, 'text') + if not os.path.exists(text_path): + return False + sr_path = os.path.join(self.output_directory, 'sr.scp') + if not os.path.exists(sr_path): + return False + wav_path = os.path.join(self.output_directory, 'wav.scp') + if not os.path.exists(wav_path): + return False + text_file_path = os.path.join(self.output_directory, 'text_file.scp') + if not os.path.exists(text_file_path): + return False + file_directory_path = os.path.join(self.output_directory, 'file_directory.scp') + if not os.path.exists(file_directory_path): + return False + wav_info_path = os.path.join(self.output_directory, 'wav_info.scp') + if not os.path.exists(wav_info_path): + return False + self.feat_mapping = load_scp(feat_path) + self.cmvn_mapping = load_scp(cmvn_path) + self.utt_speak_mapping = load_scp(utt2spk_path) + self.speak_utt_mapping = load_scp(spk2utt_path) + for speak, utts in self.speak_utt_mapping.items(): + if not isinstance(utts, list): + self.speak_utt_mapping[speak] = [utts] + self.text_mapping = load_scp(text_path) + for utt, text in self.text_mapping.items(): + for w in text: + new_w = re.split(r"[-']", w) + self.word_counts.update(new_w + [w]) + self.text_mapping[utt] = ' '.join(text) + self.sample_rates = {int(k): v for k,v in load_scp(sr_path).items()} + self.utt_wav_mapping = load_scp(wav_path) + self.wav_info = load_scp(wav_info_path, float) + self.utt_text_file_mapping = load_scp(text_file_path) + for p in self.utt_text_file_mapping.values(): + if p.lower().endswith('.textgrid'): + self.tg_count += 1 + else: + self.lab_count += 1 + self.file_directory_mapping = load_scp(file_directory_path) + segments_path = os.path.join(self.output_directory, 'segments.scp') + if os.path.exists(segments_path): + self.segments = load_scp(segments_path) + 
speaker_ordering_path = os.path.join(self.output_directory, 'speaker_ordering.scp') + if os.path.exists(speaker_ordering_path): + self.speaker_ordering = load_scp(speaker_ordering_path) + self.logger.debug('Loaded from corpus_data temp directory in {} seconds'.format(time.time()-begin_time)) + return True + + def _load_from_source_mp(self): + begin_time = time.time() + manager = mp.Manager() + job_queue = manager.Queue() + return_queue = manager.Queue() + return_dict = manager.dict() + stopped = Stopped() + + procs = [] + for i in range(self.num_jobs): + p = CorpusProcessWorker(job_queue, return_dict, return_queue, stopped) + procs.append(p) + p.start() + for root, dirs, files in os.walk(self.directory, followlinks=True): - wav_files = find_ext(files, '.wav') - lab_files = find_ext(files, '.lab') - txt_files = find_ext(files, '.txt') - textgrid_files = find_ext(files, '.textgrid') - for f in sorted(files): - file_name, ext = os.path.splitext(f) - if ext.lower() != '.wav': - if ext.lower() in ['.lab', '.textgrid']: - wav_path = wav_files[file_name] - if wav_path is None: - self.transcriptions_without_wavs.append(os.path.join(root, f)) - continue + wav_files, lab_files, textgrid_files = find_exts(files) + relative_path = root.replace(self.directory, '').lstrip('/').lstrip('\\') + for file_name, f in wav_files.items(): wav_path = os.path.join(root, f) - try: - sr = get_sample_rate(wav_path) - except Exception: - self.wav_read_errors.append(wav_path) - continue - if sr < 16000: - self.unsupported_sample_rate.append(wav_path) - bit_depth = get_bit_depth(wav_path) - if bit_depth != 16: - self.unsupported_bit_depths.append(wav_path) + if file_name in lab_files: + lab_name = lab_files[file_name] + transcription_path = os.path.join(root, lab_name) + + elif file_name in textgrid_files: + tg_name = textgrid_files[file_name] + transcription_path = os.path.join(root, tg_name) + else: + self.no_transcription_files.append(wav_path) continue - # .lab files have higher priority 
than .txt files - lab_name = lab_files[file_name] if file_name in lab_files else txt_files[file_name] - if lab_name is not None: - utt_name = file_name - if utt_name in self.utt_wav_mapping: - ind = 0 - fixed_utt_name = utt_name - while fixed_utt_name not in self.utt_wav_mapping: - ind += 1 - fixed_utt_name = utt_name + '_{}'.format(ind) - utt_name = fixed_utt_name - if self.feat_mapping and utt_name not in self.feat_mapping: - self.ignored_utterances.append(utt_name) - continue + job_queue.put((file_name, wav_path, transcription_path, relative_path, self.speaker_characters, self.temp_directory)) + job_queue.join() + stopped.stop() + for p in procs: + p.join() + + while True: + try: + info = return_queue.get(timeout=1) + except Empty: + break + if 'segments' not in info: # was a lab file + utt_name = info['utt_name'] + speaker_name = info['speaker_name'] + wav_info = info['wav_info'] + sr = wav_info['sample_rate'] + if utt_name in self.utt_wav_mapping: + ind = 0 + fixed_utt_name = utt_name + while fixed_utt_name not in self.utt_wav_mapping: + ind += 1 + fixed_utt_name = utt_name + '_{}'.format(ind) + utt_name = fixed_utt_name + file_name = utt_name + words = info['words'] + for w in words: + new_w = re.split(r"[-']", w) + self.word_counts.update(new_w + [w]) + self.text_mapping[utt_name] = ' '.join(words) + self.utt_text_file_mapping[utt_name] = info['text_file'] + self.speak_utt_mapping[speaker_name].append(utt_name) + self.utt_wav_mapping[utt_name] = info['wav_path'] + self.sample_rates[sr].add(speaker_name) + self.utt_speak_mapping[utt_name] = speaker_name + self.file_directory_mapping[utt_name] = info['relative_path'] + self.lab_count += 1 + else: + wav_info = info['wav_info'] + sr = wav_info['sample_rate'] + file_name = info['recording_name'] + self.wav_files.append(file_name) + self.speaker_ordering[file_name] = info['speaker_ordering'] + for s in info['speaker_ordering']: + self.sample_rates[sr].add(s) + self.segments.update(info['segments']) + 
self.utt_wav_mapping.update(info['utt_wav_mapping']) + self.utt_text_file_mapping.update(info['utt_text_file_mapping']) + for utt, words in info['text_mapping'].items(): + for w in words: + new_w = re.split(r"[-']", w) + self.word_counts.update(new_w + [w]) + self.text_mapping[utt] = ' '.join(words) + self.utt_speak_mapping.update(info['utt_speak_mapping']) + for speak, utts in info['speak_utt_mapping'].items(): + if speak not in self.speak_utt_mapping: + self.speak_utt_mapping[speak] = utts + else: + self.speak_utt_mapping[speak].extend(utts) + for fn in info['file_names']: + self.file_directory_mapping[fn] = info['relative_path'] + self.tg_count += 1 + self.wav_info[file_name] = [wav_info['num_channels'], wav_info['sample_rate'], wav_info['duration']] + for k in ['wav_read_errors', 'unsupported_sample_rate', 'unsupported_bit_depths', + 'decode_error_files', 'textgrid_read_errors']: + if hasattr(self, k): + if k in return_dict: + if k == 'textgrid_read_errors': + getattr(self, k).update(return_dict[k]) + else: + setattr(self, k, return_dict[k]) + self.logger.debug('Parsed corpus directory with {} jobs in {} seconds'.format(self.num_jobs, time.time()-begin_time)) + + def _load_from_source(self): + begin_time = time.time() + for root, dirs, files in os.walk(self.directory, followlinks=True): + wav_files, lab_files, textgrid_files = find_exts(files) + relative_path = root.replace(self.directory, '').lstrip('/').lstrip('\\') + for file_name, f in wav_files.items(): + wav_path = os.path.join(root, f) + if file_name in lab_files: + lab_name = lab_files[file_name] lab_path = os.path.join(root, lab_name) try: - text = load_text(lab_path) - except UnicodeDecodeError: + info = parse_lab_file(file_name, wav_path, lab_path, relative_path, speaker_characters=self.speaker_characters) + utt_name = info['utt_name'] + speaker_name = info['speaker_name'] + wav_info = info['wav_info'] + sr = wav_info['sample_rate'] + if utt_name in self.utt_wav_mapping: + ind = 0 + fixed_utt_name = 
utt_name + while fixed_utt_name not in self.utt_wav_mapping: + ind += 1 + fixed_utt_name = utt_name + '_{}'.format(ind) + utt_name = fixed_utt_name + + words = info['words'] + for w in words: + new_w = re.split(r"[-']", w) + self.word_counts.update(new_w + [w]) + self.text_mapping[utt_name] = ' '.join(words) + self.utt_text_file_mapping[utt_name] = lab_path + self.speak_utt_mapping[speaker_name].append(utt_name) + self.utt_wav_mapping[utt_name] = wav_path + self.sample_rates[sr].add(speaker_name) + self.utt_speak_mapping[utt_name] = speaker_name + self.file_directory_mapping[utt_name] = relative_path + self.lab_count += 1 + except WavReadError: + self.wav_read_errors.append(wav_path) + except SampleRateError: + self.unsupported_sample_rate.append(wav_path) + except BitDepthError: + self.unsupported_bit_depths.append(wav_path) + except TextParseError: self.decode_error_files.append(lab_path) - continue - words = parse_transcription(text) - if not words: - continue - self.word_counts.update(words) - self.text_mapping[utt_name] = ' '.join(words) - if self.speaker_directories: - speaker_name = os.path.basename(root) - else: - if isinstance(speaker_characters, int): - speaker_name = f[:speaker_characters] - elif speaker_characters == 'prosodylab': - speaker_name = f.split('_')[1] - else: - speaker_name = f - speaker_name = speaker_name.strip().replace(' ', '_') - utt_name = utt_name.strip().replace(' ', '_') - self.utt_text_file_mapping[utt_name] = lab_path - self.speak_utt_mapping[speaker_name].append(utt_name) - self.utt_wav_mapping[utt_name] = wav_path - self.sample_rates[get_sample_rate(wav_path)].add(speaker_name) - self.utt_speak_mapping[utt_name] = speaker_name - self.file_directory_mapping[utt_name] = root.replace(self.directory, '').lstrip('/').lstrip('\\') - - self.lab_count += 1 - else: + + elif file_name in textgrid_files: tg_name = textgrid_files[file_name] - if tg_name is None: - self.no_transcription_files.append(wav_path) - continue - 
self.wav_files.append(file_name) - self.wav_durations[file_name] = get_wav_duration(wav_path) tg_path = os.path.join(root, tg_name) - tg = TextGrid() try: - tg.read(tg_path) - except Exception as e: - exc_type, exc_value, exc_traceback = sys.exc_info() - self.textgrid_read_errors[tg_path] = '\n'.join( - traceback.format_exception(exc_type, exc_value, exc_traceback)) - n_channels = get_n_channels(wav_path) - num_tiers = len(tg.tiers) - if n_channels == 2: - a_name = file_name + "_A" - b_name = file_name + "_B" - - a_path, b_path = extract_temp_channels(wav_path, self.temp_directory) - elif n_channels > 2: - raise (Exception('More than two channels')) - self.speaker_ordering[file_name] = [] - if not self.speaker_directories: - if isinstance(speaker_characters, int): - speaker_name = f[:speaker_characters] - elif speaker_characters == 'prosodylab': - speaker_name = f.split('_')[1] - else: - speaker_name = f - speaker_name = speaker_name.strip().replace(' ', '_') - self.speaker_ordering[file_name].append(speaker_name) - for i, ti in enumerate(tg.tiers): - if ti.name.lower() == 'notes': - continue - if not isinstance(ti, IntervalTier): - continue - if self.speaker_directories: - speaker_name = ti.name.strip().replace(' ', '_') - self.speaker_ordering[file_name].append(speaker_name) - self.sample_rates[get_sample_rate(wav_path)].add(speaker_name) - for interval in ti: - text = interval.mark.lower().strip() - words = parse_transcription(text) - if not words: - continue - begin, end = round(interval.minTime, 4), round(interval.maxTime, 4) - utt_name = '{}_{}_{}_{}'.format(speaker_name, file_name, begin, end) - utt_name = utt_name.strip().replace(' ', '_').replace('.', '_') - if n_channels == 1: - if self.feat_mapping and utt_name not in self.feat_mapping: - self.ignored_utterances.append(utt_name) - self.segments[utt_name] = '{} {} {}'.format(file_name, begin, end) - self.utt_wav_mapping[file_name] = wav_path + info = parse_textgrid_file(file_name, wav_path, tg_path, 
relative_path, + self.speaker_characters, self.temp_directory) + wav_info = info['wav_info'] + sr = wav_info['sample_rate'] + self.wav_files.append(file_name) + self.speaker_ordering[file_name] = info['speaker_ordering'] + for s in info['speaker_ordering']: + self.sample_rates[sr].add(s) + self.segments.update(info['segments']) + self.utt_wav_mapping.update(info['utt_wav_mapping']) + self.utt_text_file_mapping.update(info['utt_text_file_mapping']) + for utt, words in info['text_mapping'].items(): + for w in words: + new_w = re.split(r"[-']", w) + self.word_counts.update(new_w + [w]) + self.text_mapping[utt] = ' '.join(words) + self.utt_speak_mapping.update(info['utt_speak_mapping']) + for speak, utts in info['speak_utt_mapping'].items(): + if speak not in self.speak_utt_mapping: + self.speak_utt_mapping[speak] = utts else: - if i < num_tiers / 2: - utt_name += '_A' - if self.feat_mapping and utt_name not in self.feat_mapping: - self.ignored_utterances.append(utt_name) - self.segments[utt_name] = '{} {} {}'.format(a_name, begin, end) - self.utt_wav_mapping[a_name] = a_path - else: - utt_name += '_B' - if self.feat_mapping and utt_name not in self.feat_mapping: - self.ignored_utterances.append(utt_name) - self.segments[utt_name] = '{} {} {}'.format(b_name, begin, end) - self.utt_wav_mapping[b_name] = b_path - self.text_mapping[utt_name] = ' '.join(words) - self.utt_text_file_mapping[utt_name] = tg_path - self.word_counts.update(words) - self.utt_speak_mapping[utt_name] = speaker_name - self.speak_utt_mapping[speaker_name].append(utt_name) - if n_channels == 2: - self.file_directory_mapping[a_name] = root.replace(self.directory, '').lstrip('/').lstrip('\\') - self.file_directory_mapping[b_name] = root.replace(self.directory, '').lstrip('/').lstrip('\\') - self.file_directory_mapping[file_name] = root.replace(self.directory, '').lstrip('/').lstrip('\\') - self.tg_count += 1 + self.speak_utt_mapping[speak].extend(utts) + for fn in info['file_names']: + 
self.file_directory_mapping[fn] = relative_path + self.tg_count += 1 + except WavReadError: + self.wav_read_errors.append(wav_path) + except SampleRateError: + self.unsupported_sample_rate.append(wav_path) + except BitDepthError: + self.unsupported_bit_depths.append(wav_path) + except TextGridParseError as e: + self.textgrid_read_errors[tg_path] = e.error + else: + self.no_transcription_files.append(wav_path) + continue + self.wav_info[file_name] = [wav_info['num_channels'], wav_info['sample_rate'], wav_info['duration']] + self.logger.debug('Parsed corpus directory in {} seconds'.format(time.time()-begin_time)) + + def check_warnings(self): self.issues_check = self.ignored_utterances or self.no_transcription_files or \ self.textgrid_read_errors or self.unsupported_sample_rate or self.decode_error_files @@ -231,7 +345,7 @@ def __init__(self, directory, output_directory, msg = 'The following speakers had multiple speaking rates: {}. ' \ 'Please make sure that each speaker has a consistent sampling rate.'.format(', '.join(bad_speakers)) self.logger.error(msg) - raise (SampleRateError(msg)) + raise (SampleRateMismatchError(msg)) if len(self.speak_utt_mapping) < self.num_jobs: self.num_jobs = len(self.speak_utt_mapping) @@ -240,9 +354,7 @@ def __init__(self, directory, output_directory, msg = 'The number of jobs was set to {}, due to the different sample rates in the dataset. 
' \ 'If you would like to use fewer parallel jobs, ' \ 'please resample all wav files to the same sample rate.'.format(self.num_jobs) - print('WARNING: ' + msg) self.logger.warning(msg) - self.find_best_groupings() def update_utterance_text(self, utterance, new_text): new_text = new_text.lower().strip() @@ -253,7 +365,6 @@ def update_utterance_text(self, utterance, new_text): found = False tg.read(text_file_path) - print(utterance) speaker_name = utterance.split('_', maxsplit=1) wave_name, begin, end = self.segments[utterance].split(' ') begin = float(begin) @@ -275,18 +386,14 @@ def update_utterance_text(self, utterance, new_text): if found: tg.write(text_file_path) else: - print('Unable to find utterance {} match in {}'.format(utterance, text_file_path)) + self.logger.warning('Unable to find utterance {} match in {}'.format(utterance, text_file_path)) else: with open(text_file_path, 'w', encoding='utf8') as f: f.write(new_text) - @property - def ivector_directory(self): - return os.path.join(self.output_directory, 'ivectors') - @property def word_set(self): - return set(self.word_counts) + return list(self.word_counts) def normalized_text_iter(self, dictionary=None, min_count=1): unk_words = set(k for k, v in self.word_counts.items() if v <= min_count) @@ -309,7 +416,6 @@ def normalized_text_iter(self, dictionary=None, min_count=1): new_text.append(item) yield ' '.join(new_text) - def grouped_text(self, dictionary=None): output = [] for g in self.groups: @@ -348,17 +454,17 @@ def grouped_text_int(self, dictionary): if u in self.ignored_utterances: continue oovs = [] + new_text = [] for i in range(len(text)): t = text[i] lookup = dictionary.to_int(t) - if lookup is None: - continue - if lookup == oov_code: - oovs.append(t) - text[i] = lookup + for w in lookup: + if w == oov_code: + oovs.append(text[i]) + new_text.append(w) if oovs: self.utterance_oovs[u] = oovs - new_text = map(str, (x for x in text if isinstance(x, int))) + new_text = map(str, (x for x in 
new_text if isinstance(x, int))) output_g.append([u, ' '.join(new_text)]) output.append(output_g) return output @@ -404,20 +510,29 @@ def grouped_utt2fst(self, dictionary, num_frequent_words=10): return output def subset_directory(self, subset, feature_config): - if subset is None or subset > self.num_utterances: + if subset is None or subset > self.num_utterances or subset <= 0: return self.split_directory() directory = os.path.join(self.output_directory, 'subset_{}'.format(subset)) - if not os.path.exists(directory): - self.create_subset(subset, feature_config) + self.create_subset(subset, feature_config) return directory def write(self): super(AlignableCorpus, self).write() self._write_text() + self._write_utt_text_file() + self._write_speaker_ordering() def _write_text(self): - text = os.path.join(self.output_directory, 'text') - output_mapping(self.text_mapping, text) + path = os.path.join(self.output_directory, 'text') + output_mapping(self.text_mapping, path) + + def _write_utt_text_file(self): + path = os.path.join(self.output_directory, 'text_file.scp') + output_mapping(self.utt_text_file_mapping, path) + + def _write_speaker_ordering(self): + path = os.path.join(self.output_directory, 'speaker_ordering.scp') + output_mapping(self.speaker_ordering, path) def _split_utt2fst(self, directory, dictionary): pattern = 'utt2fst.{}' @@ -442,48 +557,71 @@ def initialize_corpus(self, dictionary): raise CorpusError('There were no wav files found for transcribing this corpus. 
Please validate the corpus.') split_dir = self.split_directory() self.write() - if not os.path.exists(split_dir): - self.split(dictionary) + self.split(dictionary) self.figure_utterance_lengths() def create_subset(self, subset, feature_config): - larger_subset_num = subset * 10 - if larger_subset_num < self.num_utterances: - # Get all shorter utterances that are not one word long - utts = sorted((x for x in self.utterance_lengths.keys() if ' ' in self.text_mapping[x]), - key=lambda x: self.utterance_lengths[x]) - larger_subset = utts[:larger_subset_num] - else: - larger_subset = self.utterance_lengths.keys() - subset_utts = set(random.sample(larger_subset, subset)) split_directory = self.split_directory() subset_directory = os.path.join(self.output_directory, 'subset_{}'.format(subset)) - log_dir = os.path.join(subset_directory, 'log') - os.makedirs(log_dir, exist_ok=True) subset_utt_path = os.path.join(subset_directory, 'included_utts.txt') - with open(subset_utt_path, 'w', encoding='utf8') as f: - for u in subset_utts: - f.write('{}\n'.format(u)) + if os.path.exists(subset_utt_path): + subset_utts = [] + with open(subset_utt_path, 'r', encoding='utf8') as f: + for line in f: + subset_utts.append(line.strip()) + else: + larger_subset_num = subset * 10 + if larger_subset_num < self.num_utterances: + # Get all shorter utterances that are not one word long + utts = sorted((x for x in self.utterance_lengths.keys() if ' ' in self.text_mapping[x]), + key=lambda x: self.utterance_lengths[x]) + larger_subset = utts[:larger_subset_num] + else: + larger_subset = self.utterance_lengths.keys() + random.seed(1234) # make it deterministic sampling + subset_utts = set(random.sample(larger_subset, subset)) + log_dir = os.path.join(subset_directory, 'log') + os.makedirs(log_dir, exist_ok=True) + with open(subset_utt_path, 'w', encoding='utf8') as f: + for u in subset_utts: + f.write('{}\n'.format(u)) for j in range(self.num_jobs): for fn in ['text.{}', 'text.{}.int', 
'utt2spk.{}']: + sub_path = os.path.join(subset_directory, fn.format(j)) with open(os.path.join(split_directory, fn.format(j)), 'r', encoding='utf8') as inf, \ - open(os.path.join(subset_directory, fn.format(j)), 'w', encoding='utf8') as outf: + open(sub_path, 'w', encoding='utf8') as outf: for line in inf: s = line.split() if s[0] not in subset_utts: continue outf.write(line) + subset_speakers = [] + sub_path = os.path.join(subset_directory, 'spk2utt.{}'.format(j)) with open(os.path.join(split_directory, 'spk2utt.{}'.format(j)), 'r', encoding='utf8') as inf, \ - open(os.path.join(subset_directory, 'spk2utt.{}'.format(j)), 'w', encoding='utf8') as outf: + open(sub_path, 'w', encoding='utf8') as outf: for line in inf: line = line.split() speaker, utts = line[0], line[1:] filtered_utts = [x for x in utts if x in subset_utts] + if not filtered_utts: + continue outf.write('{} {}\n'.format(speaker, ' '.join(filtered_utts))) + subset_speakers.append(speaker) + sub_path = os.path.join(subset_directory, 'cmvn.{}.scp'.format(j)) + with open(os.path.join(split_directory, 'cmvn.{}.scp'.format(j)), 'r', encoding='utf8') as inf, \ + open(sub_path, 'w', encoding='utf8') as outf: + for line in inf: + line = line.split() + speaker, cmvn = line[0], line[1] + if speaker not in subset_speakers: + continue + outf.write('{} {}\n'.format(speaker, cmvn)) if feature_config is not None: base_path = os.path.join(split_directory, feature_config.feature_id + '.{}.scp'.format(j)) subset_scp = os.path.join(subset_directory, feature_config.feature_id + '.{}.scp'.format(j)) + if os.path.exists(subset_scp): + continue filtered = filter_scp(subset_utts, base_path) with open(subset_scp, 'w') as f: for line in filtered: diff --git a/montreal_forced_aligner/corpus/base.py b/montreal_forced_aligner/corpus/base.py index 1c4f8183..411544e3 100644 --- a/montreal_forced_aligner/corpus/base.py +++ b/montreal_forced_aligner/corpus/base.py @@ -10,66 +10,36 @@ from ..helper import thirdparty_binary, load_text, 
load_scp, output_mapping, save_groups, filter_scp -def get_n_channels(file_path): - """ - Return the number of channels for a sound file - - Parameters - ---------- - file_path : str - Path to a wav file - - Returns - ------- - int - Number of channels (1 if mono, 2 if stereo) - """ - +def get_wav_info(file_path): with soundfile.SoundFile(file_path, 'r') as inf: n_channels = inf.channels subtype = inf.subtype if not subtype.startswith('PCM'): raise SampleRateError('The file {} is not a PCM file.'.format(file_path)) - return n_channels - - -def get_sample_rate(file_path): - return librosa.get_samplerate(file_path) - - -def get_bit_depth(file_path): - with soundfile.SoundFile(file_path, 'r') as inf: - subtype = inf.subtype bit_depth = int(subtype.replace('PCM_', '')) - return bit_depth - + frames = inf.frames + sr = inf.samplerate + duration = frames / sr + return {'num_channels': n_channels, 'type': subtype, 'bit_depth': bit_depth, + 'sample_rate' : sr, 'duration': duration} -def get_wav_duration(file_path): - return librosa.get_duration(filename=file_path) - -def find_ext(files, ext): - """ - Finds all files with extension `ext` in `files`. 
- - Parameters - ---------- - files : list - List of files to search in - ext : str - File extension - - Returns - ------- - dict - A dictionary of pairs (filename, full_filename) - """ - dic = defaultdict(lambda: None) +def find_exts(files): + wav_files = {} + lab_files = {} + textgrid_files = {} for full_filename in files: filename, fext = os.path.splitext(full_filename) - if fext.lower() == ext: - dic[filename] = full_filename - return dic + fext = fext.lower() + if fext == '.wav': + wav_files[filename] = full_filename + elif fext == '.lab': + lab_files[filename] = full_filename + elif fext == '.txt' and filename not in lab_files: # .lab files have higher priority than .txt files + lab_files[filename] = full_filename + elif fext== '.textgrid': + textgrid_files[filename] = full_filename + return wav_files, lab_files, textgrid_files def extract_temp_channels(wav_path, temp_directory): @@ -140,17 +110,21 @@ class BaseCorpus(object): def __init__(self, directory, output_directory, speaker_characters=0, - num_jobs=3, debug=False): + num_jobs=3, debug=False, logger=None, use_mp=True): self.debug = debug + self.use_mp = use_mp log_dir = os.path.join(output_directory, 'logging') os.makedirs(log_dir, exist_ok=True) self.name = os.path.basename(directory) self.log_file = os.path.join(log_dir, 'corpus.log') - self.logger = logging.getLogger('corpus_setup') - self.logger.setLevel(logging.INFO) - handler = logging.FileHandler(self.log_file, 'w', 'utf-8') - handler.setFormatter = logging.Formatter('%(name)s %(message)s') - self.logger.addHandler(handler) + if logger is None: + self.logger = logging.getLogger('corpus_setup') + self.logger.setLevel(logging.INFO) + handler = logging.FileHandler(self.log_file, 'w', 'utf-8') + handler.setFormatter = logging.Formatter('%(name)s %(message)s') + self.logger.addHandler(handler) + else: + self.logger = logger if not os.path.exists(directory): raise CorpusError('The directory \'{}\' does not exist.'.format(directory)) if not 
os.path.isdir(directory): @@ -158,13 +132,15 @@ def __init__(self, directory, output_directory, if num_jobs < 1: num_jobs = 1 + if num_jobs == 1: + self.use_mp = False self.original_num_jobs = num_jobs - print('Setting up corpus information...') self.logger.info('Setting up corpus information...') self.directory = directory self.output_directory = os.path.join(output_directory, 'corpus_data') self.temp_directory = os.path.join(self.output_directory, 'temp') os.makedirs(self.temp_directory, exist_ok=True) + self.speaker_characters = speaker_characters if speaker_characters == 0: self.speaker_directories = True else: @@ -172,8 +148,9 @@ def __init__(self, directory, output_directory, self.num_jobs = num_jobs self.sample_rates = defaultdict(set) self.unsupported_sample_rate = [] + self.text_mapping = {} self.wav_files = [] - self.wav_durations = {} + self.wav_info = {} self.unsupported_bit_depths = [] self.wav_read_errors = [] self.speak_utt_mapping = defaultdict(list) @@ -370,14 +347,19 @@ def grouped_spk2utt(self): return output def get_wav_duration(self, utt): - if utt in self.wav_durations: - return self.wav_durations[utt] + if utt in self.wav_info: + return self.wav_info[utt][-1] if not self.segments: wav_path = self.utt_wav_mapping[utt] else: - rec = self.segments[utt].split(' ')[0] - wav_path = self.utt_wav_mapping[rec] - return get_wav_duration(wav_path) + if utt in self.utt_wav_mapping: + wav_path = self.utt_wav_mapping[utt] + else: + rec = self.segments[utt].split(' ')[0] + if rec in self.wav_info: + return self.wav_info[rec][-1] + wav_path = self.utt_wav_mapping[rec] + return get_wav_info(wav_path)['duration'] def split_directory(self): directory = os.path.join(self.output_directory, 'split{}'.format(self.num_jobs)) @@ -392,8 +374,20 @@ def _write_speak_utt(self): output_mapping(self.speak_utt_mapping, spk2utt) def _write_wavscp(self): - wavscp = os.path.join(self.output_directory, 'wav.scp') - output_mapping(self.utt_wav_mapping, wavscp) + path = 
os.path.join(self.output_directory, 'wav.scp') + output_mapping(self.utt_wav_mapping, path) + + def _write_speaker_sr(self): + path = os.path.join(self.output_directory, 'sr.scp') + output_mapping(self.sample_rates, path) + + def _write_wav_info(self): + path = os.path.join(self.output_directory, 'wav_info.scp') + output_mapping(self.wav_info, path) + + def _write_file_directory(self): + path = os.path.join(self.output_directory, 'file_directory.scp') + output_mapping(self.file_directory_mapping, path) def _write_segments(self): if not self.segments: @@ -451,27 +445,34 @@ def combine_feats(self): def figure_utterance_lengths(self): feat_path = os.path.join(self.output_directory, 'feats.scp') - if os.path.exists(feat_path): - with open(os.devnull, 'w') as devnull: - dim_proc = subprocess.Popen([thirdparty_binary('feat-to-len'), - 'scp:' + feat_path, 'ark,t:-'], - stdout=subprocess.PIPE, - stderr=devnull) - stdout, stderr = dim_proc.communicate() - feats = stdout.decode('utf8').strip() - for line in feats.splitlines(): - line = line.strip() - line = line.split() - self.utterance_lengths[line[0]] = int(line[1]) + lengths_path = os.path.join(self.output_directory, 'utterance_lengths.scp') + if os.path.exists(feat_path) and not self.utterance_lengths: + if os.path.exists(lengths_path): + self.utterance_lengths = load_scp(lengths_path, int) + else: + with open(os.devnull, 'w') as devnull: + dim_proc = subprocess.Popen([thirdparty_binary('feat-to-len'), + 'scp:' + feat_path, 'ark,t:-'], + stdout=subprocess.PIPE, + stderr=devnull) + stdout, stderr = dim_proc.communicate() + feats = stdout.decode('utf8').strip() + for line in feats.splitlines(): + line = line.strip() + line = line.split() + self.utterance_lengths[line[0]] = int(line[1]) + output_mapping(self.utterance_lengths, lengths_path) + def get_feat_dim(self, feature_config): - path = os.path.join(self.split_directory(), feature_config.feature_id + '.0.scp') + feature_string = 
feature_config.construct_feature_proc_string(self.split_directory(), None, 0) with open(os.devnull, 'w') as devnull: dim_proc = subprocess.Popen([thirdparty_binary('feat-to-dim'), - 'scp:' + path, '-'], + feature_string, '-'], stdout=subprocess.PIPE, - stderr=devnull) + #stderr=devnull + ) stdout, stderr = dim_proc.communicate() feats = stdout.decode('utf8').strip() return int(feats) @@ -480,12 +481,14 @@ def write(self): self._write_speak_utt() self._write_utt_speak() self._write_wavscp() + self._write_speaker_sr() + self._write_wav_info() + self._write_file_directory() def split(self): split_dir = self.split_directory() os.makedirs(os.path.join(split_dir, 'log'), exist_ok=True) self.logger.info('Setting up training data...') - print('Setting up corpus_data directory...') self._split_wavs(split_dir) self._split_utt2spk(split_dir) self._split_spk2utt(split_dir) diff --git a/montreal_forced_aligner/corpus/transcribe_corpus.py b/montreal_forced_aligner/corpus/transcribe_corpus.py index d38d6fd5..87b1973e 100644 --- a/montreal_forced_aligner/corpus/transcribe_corpus.py +++ b/montreal_forced_aligner/corpus/transcribe_corpus.py @@ -1,35 +1,187 @@ import os import sys import traceback +import time +from collections import defaultdict from textgrid import TextGrid, IntervalTier -from .base import BaseCorpus, get_sample_rate, get_bit_depth, find_ext, get_n_channels, extract_temp_channels +from .base import BaseCorpus, get_wav_info, find_exts, extract_temp_channels +from ..helper import save_groups, load_scp, save_scp from ..exceptions import SampleRateError, CorpusError +from ..multiprocessing import segment_vad +import multiprocessing as mp +from queue import Empty +from ..multiprocessing.helper import Stopped +from ..multiprocessing.corpus import CorpusProcessWorker, parse_transcription, parse_lab_file, parse_textgrid_file class TranscribeCorpus(BaseCorpus): def __init__(self, directory, output_directory, speaker_characters=0, - num_jobs=3, debug=False): + num_jobs=3, 
debug=False, logger=None, use_mp=True): super(TranscribeCorpus, self).__init__(directory, output_directory, speaker_characters, - num_jobs, debug) + num_jobs, debug, logger, use_mp) + self.vad_segments = {} + + loaded = self._load_from_temp() + if not loaded: + if self.use_mp: + self._load_from_source_mp() + else: + self._load_from_source() + self.check_warnings() + self.find_best_groupings() + + def _load_from_temp(self): + begin_time = time.time() + feat_path = os.path.join(self.output_directory, 'feats.scp') + if not os.path.exists(feat_path): + return False + cmvn_path = os.path.join(self.output_directory, 'cmvn.scp') + if not os.path.exists(cmvn_path): + return False + utt2spk_path = os.path.join(self.output_directory, 'utt2spk') + if not os.path.exists(utt2spk_path): + return False + spk2utt_path = os.path.join(self.output_directory, 'spk2utt') + if not os.path.exists(spk2utt_path): + return False + sr_path = os.path.join(self.output_directory, 'sr.scp') + if not os.path.exists(sr_path): + return False + wav_path = os.path.join(self.output_directory, 'wav.scp') + if not os.path.exists(wav_path): + return False + file_directory_path = os.path.join(self.output_directory, 'file_directory.scp') + if not os.path.exists(file_directory_path): + return False + wav_info_path = os.path.join(self.output_directory, 'wav_info.scp') + if not os.path.exists(wav_info_path): + return False + self.feat_mapping = load_scp(feat_path) + self.cmvn_mapping = load_scp(cmvn_path) + self.utt_speak_mapping = load_scp(utt2spk_path) + self.speak_utt_mapping = load_scp(spk2utt_path) + self.sample_rates = {int(k): v for k,v in load_scp(sr_path).items()} + self.utt_wav_mapping = load_scp(wav_path) + self.wav_info = load_scp(wav_info_path, float) + self.file_directory_mapping = load_scp(file_directory_path) + segments_path = os.path.join(self.output_directory, 'segments.scp') + if os.path.exists(segments_path): + self.segments = load_scp(segments_path) + speaker_ordering_path = 
os.path.join(self.output_directory, 'speaker_ordering.scp') + if os.path.exists(speaker_ordering_path): + self.speaker_ordering = load_scp(speaker_ordering_path) + text_path = os.path.join(self.output_directory, 'text') + if os.path.exists(text_path): + self.text_mapping = load_scp(text_path) + for utt, text in self.text_mapping.items(): + self.text_mapping[utt] = ' '.join(text) + self.logger.debug('Loaded from corpus_data temp directory in {} seconds'.format(time.time()-begin_time)) + return True + + def _load_from_source_mp(self): + begin_time = time.time() + manager = mp.Manager() + job_queue = manager.Queue() + return_queue = manager.Queue() + return_dict = manager.dict() + stopped = Stopped() + + procs = [] + for i in range(self.num_jobs): + p = CorpusProcessWorker(job_queue, return_dict, return_queue, stopped) + procs.append(p) + p.start() + for root, dirs, files in os.walk(self.directory, followlinks=True): - wav_files = find_ext(files, '.wav') - textgrid_files = find_ext(files, '.textgrid') - for f in sorted(files): - file_name, ext = os.path.splitext(f) - if ext.lower() != '.wav': - continue + wav_files, lab_files, textgrid_files = find_exts(files) + relative_path = root.replace(self.directory, '').lstrip('/').lstrip('\\') + for file_name, f in wav_files.items(): + wav_path = os.path.join(root, f) + if file_name in textgrid_files: + tg_name = textgrid_files[file_name] + transcription_path = os.path.join(root, tg_name) + else: + transcription_path = None + job_queue.put((file_name, wav_path, transcription_path, relative_path, self.speaker_characters, self.temp_directory)) + job_queue.join() + + for p in procs: + p.join() + + while True: + try: + info = return_queue.get(timeout=1) + except Empty: + break + if 'segments' not in info: # didn't have a textgrid file + utt_name = info['utt_name'] + speaker_name = info['speaker_name'] + wav_info = info['wav_info'] + sr = wav_info['sample_rate'] + if utt_name in self.utt_wav_mapping: + ind = 0 + fixed_utt_name = 
utt_name + while fixed_utt_name not in self.utt_wav_mapping: + ind += 1 + fixed_utt_name = utt_name + '_{}'.format(ind) + utt_name = fixed_utt_name + file_name = utt_name + self.speak_utt_mapping[speaker_name].append(utt_name) + self.utt_wav_mapping[utt_name] = info['wav_path'] + self.sample_rates[sr].add(speaker_name) + self.utt_speak_mapping[utt_name] = speaker_name + self.file_directory_mapping[utt_name] = info['relative_path'] + else: + wav_info = info['wav_info'] + sr = wav_info['sample_rate'] + file_name = info['recording_name'] + self.wav_files.append(file_name) + self.speaker_ordering[file_name] = info['speaker_ordering'] + for s in info['speaker_ordering']: + self.sample_rates[sr].add(s) + self.segments.update(info['segments']) + self.utt_wav_mapping.update(info['utt_wav_mapping']) + for utt, words in info['text_mapping'].items(): + self.text_mapping[utt] = ' '.join(words) + self.utt_speak_mapping.update(info['utt_speak_mapping']) + for speak, utts in info['speak_utt_mapping'].items(): + if speak not in self.speak_utt_mapping: + self.speak_utt_mapping[speak] = utts + else: + self.speak_utt_mapping[speak].extend(utts) + for fn in info['file_names']: + self.file_directory_mapping[fn] = info['relative_path'] + + self.wav_info[file_name] = [wav_info['num_channels'], wav_info['sample_rate'], wav_info['duration']] + + for k in ['wav_read_errors', 'unsupported_sample_rate', 'unsupported_bit_depths', + 'decode_error_files', 'textgrid_read_errors']: + if hasattr(self, k): + if k in return_dict: + if k == 'textgrid_read_errors': + getattr(self, k).update(return_dict[k]) + else: + setattr(self, k, return_dict[k]) + self.logger.debug('Parsed corpus directory with {} jobs in {} seconds'.format(self.num_jobs, time.time()-begin_time)) + + def _load_from_source(self): + for root, dirs, files in os.walk(self.directory, followlinks=True): + wav_files, lab_files, textgrid_files = find_exts(files) + for file_name, f in wav_files.items(): wav_path = os.path.join(root, f) try: 
- sr = get_sample_rate(wav_path) + wav_info = get_wav_info(wav_path) + sr = wav_info['sample_rate'] except Exception: self.wav_read_errors.append(wav_path) continue - bit_depth = get_bit_depth(wav_path) + bit_depth = wav_info['bit_depth'] + wav_max_time = wav_info['duration'] if bit_depth != 16: self.unsupported_bit_depths.append(wav_path) continue @@ -38,9 +190,9 @@ def __init__(self, directory, output_directory, if self.speaker_directories: speaker_name = os.path.basename(root) else: - if isinstance(speaker_characters, int): - speaker_name = f[:speaker_characters] - elif speaker_characters == 'prosodylab': + if isinstance(self.speaker_characters, int): + speaker_name = f[:self.speaker_characters] + elif self.speaker_characters == 'prosodylab': speaker_name = f.split('_')[1] else: speaker_name = f @@ -56,7 +208,7 @@ def __init__(self, directory, output_directory, utt_name = utt_name.strip().replace(' ', '_') self.utt_wav_mapping[utt_name] = wav_path self.speak_utt_mapping[speaker_name].append(utt_name) - self.sample_rates[get_sample_rate(wav_path)].add(speaker_name) + self.sample_rates[sr].add(speaker_name) self.utt_speak_mapping[utt_name] = speaker_name self.file_directory_mapping[utt_name] = root.replace(self.directory, '').lstrip('/').lstrip('\\') @@ -70,7 +222,7 @@ def __init__(self, directory, output_directory, exc_type, exc_value, exc_traceback = sys.exc_info() self.textgrid_read_errors[tg_path] = '\n'.join( traceback.format_exception(exc_type, exc_value, exc_traceback)) - n_channels = get_n_channels(wav_path) + n_channels = wav_info['num_channels'] num_tiers = len(tg.tiers) if n_channels == 2: a_name = file_name + "_A" @@ -81,9 +233,9 @@ def __init__(self, directory, output_directory, raise (Exception('More than two channels')) self.speaker_ordering[file_name] = [] if not self.speaker_directories: - if isinstance(speaker_characters, int): - speaker_name = f[:speaker_characters] - elif speaker_characters == 'prosodylab': + if 
isinstance(self.speaker_characters, int): + speaker_name = f[:self.speaker_characters] + elif self.speaker_characters == 'prosodylab': speaker_name = f.split('_')[1] else: speaker_name = f @@ -97,12 +249,15 @@ def __init__(self, directory, output_directory, if self.speaker_directories: speaker_name = ti.name.strip().replace(' ', '_') self.speaker_ordering[file_name].append(speaker_name) - self.sample_rates[get_sample_rate(wav_path)].add(speaker_name) + self.sample_rates[sr].add(speaker_name) for interval in ti: text = interval.mark.lower().strip() if not text: continue + begin, end = round(interval.minTime, 4), round(interval.maxTime, 4) + if end > wav_max_time: + end = wav_max_time utt_name = '{}_{}_{}_{}'.format(speaker_name, file_name, begin, end) utt_name = utt_name.strip().replace(' ', '_').replace('.', '_') if n_channels == 1: @@ -123,9 +278,11 @@ def __init__(self, directory, output_directory, self.ignored_utterances.append(utt_name) self.segments[utt_name] = '{} {} {}'.format(b_name, begin, end) self.utt_wav_mapping[b_name] = b_path + self.text_mapping[utt_name] = text self.utt_speak_mapping[utt_name] = speaker_name self.speak_utt_mapping[speaker_name].append(utt_name) + def check_warnings(self): bad_speakers = [] for speaker in self.speak_utt_mapping.keys(): count = 0 @@ -149,9 +306,8 @@ def __init__(self, directory, output_directory, 'please resample all wav files to the same sample rate.'.format(self.num_jobs) print('WARNING: ' + msg) self.logger.warning(msg) - self.find_best_groupings() - def initialize_corpus(self): + def initialize_corpus(self, dictionary=None): if not self.utt_wav_mapping: raise CorpusError('There were no wav files found for transcribing this corpus. 
Please validate the corpus.') split_dir = self.split_directory() @@ -159,3 +315,11 @@ def initialize_corpus(self): if not os.path.exists(split_dir): self.split() self.figure_utterance_lengths() + + def create_vad_segments(self, segmentation_config): + segment_vad(self, segmentation_config) + directory = self.split_directory() + self.vad_segments = {} + for i in range(self.num_jobs): + vad_segments_path = os.path.join(directory, 'vad_segments.{}.scp'.format(i)) + self.vad_segments.update(load_scp(vad_segments_path)) diff --git a/montreal_forced_aligner/dictionary.py b/montreal_forced_aligner/dictionary.py index fffa12ea..c6ca6cac 100644 --- a/montreal_forced_aligner/dictionary.py +++ b/montreal_forced_aligner/dictionary.py @@ -2,6 +2,8 @@ import math import subprocess import re +import logging +import sys from collections import defaultdict, Counter from .helper import thirdparty_binary @@ -134,7 +136,7 @@ class Dictionary(object): def __init__(self, input_path, output_directory, oov_code='', position_dependent_phones=True, num_sil_states=5, num_nonsil_states=3, shared_silence_phones=True, - sil_prob=0.5, word_set=None, debug=False): + sil_prob=0.5, word_set=None, debug=False, logger=None): if not os.path.exists(input_path): raise (DictionaryPathError(input_path)) if not os.path.isfile(input_path): @@ -142,11 +144,22 @@ def __init__(self, input_path, output_directory, oov_code='', self.input_path = input_path self.debug = debug self.output_directory = os.path.join(output_directory, 'dictionary') + os.makedirs(self.output_directory, exist_ok=True) + self.log_file = os.path.join(self.output_directory, 'dictionary.log') + if logger is None: + self.logger = logging.getLogger('dictionary_setup') + self.logger.setLevel(logging.INFO) + handler = logging.FileHandler(self.log_file, 'w', 'utf-8') + handler.setFormatter = logging.Formatter('%(name)s %(message)s') + self.logger.addHandler(handler) + else: + self.logger = logger self.num_sil_states = num_sil_states 
self.num_nonsil_states = num_nonsil_states self.shared_silence_phones = shared_silence_phones self.sil_prob = sil_prob self.oov_code = oov_code + self.sil_code = '!sil' self.oovs_found = Counter() self.position_dependent_phones = position_dependent_phones @@ -163,7 +176,7 @@ def __init__(self, input_path, output_directory, oov_code='', word_set.add(self.oov_code) self.word_set = word_set self.clitic_set = set() - self.words['!sil'].append({'pronunciation': ('sp',), 'probability': 1}) + self.words[self.sil_code].append({'pronunciation': ('sp',), 'probability': 1}) self.words[self.oov_code].append({'pronunciation': ('spn',), 'probability': 1}) self.pronunciation_probabilities, self.silence_probabilities = check_format(input_path) progress = 'Parsing dictionary' @@ -175,7 +188,7 @@ def __init__(self, input_path, output_directory, oov_code='', progress += ' with silence probabilties' else: progress += ' without silence probabilties' - print(progress) + self.logger.info(progress) with open(input_path, 'r', encoding='utf8') as inf: for i, line in enumerate(inf): line = line.strip() @@ -225,9 +238,16 @@ def __init__(self, input_path, output_directory, oov_code='', self.phone_mapping = {} self.words_mapping = {} + def set_word_set(self, word_set): + word_set = {sanitize(x) for x in word_set} + word_set.add(self.sil_code) + word_set.add(self.oov_code) + self.word_set = word_set + self.generate_mappings() + @property def actual_words(self): - return {k: v for k, v in self.words.items() if k not in ['!sil', self.oov_code]} + return {k: v for k, v in self.words.items() if k not in [self.sil_code, self.oov_code, ''] and len(v)} def split_clitics(self, item): if item in self.words: @@ -289,7 +309,6 @@ def generate_mappings(self): self.words_mapping['#0'] = i + 1 self.words_mapping[''] = i + 2 self.words_mapping[''] = i + 3 - self.oovs_found = Counter() self.add_disambiguation() @@ -336,7 +355,7 @@ def create_utterance_fst(self, text, frequent_words): text = '' for k, v in 
word_probs.items(): cost = -1 * math.log(v) - text += '0 0 {w} {w} {cost}\n'.format(w=self.to_int(k), cost=cost) + text += '0 0 {w} {w} {cost}\n'.format(w=self.to_int(k)[0], cost=cost) text += '0 {}\n'.format(-1 * math.log(1 / num_words)) return text @@ -345,12 +364,16 @@ def to_int(self, item): Convert a given word into its integer id """ if item == '': - return None - item = self._lookup(item) - if item not in self.words_mapping: - self.oovs_found.update([item]) - return self.oov_int - return self.words_mapping[item] + return [] + sanitized = self._lookup(item) + text_int = [] + for item in sanitized: + if item not in self.words_mapping: + self.oovs_found.update([item]) + text_int.append(self.oov_int) + else: + text_int.append(self.words_mapping[item]) + return text_int def save_oovs_found(self, directory): """ @@ -369,14 +392,12 @@ def save_oovs_found(self, directory): def _lookup(self, item): if item in self.words_mapping: - return item + return [item] sanitized = sanitize(item) if sanitized in self.words_mapping: - return sanitized - sanitized = sanitize_clitics(item) - if sanitized in self.words_mapping: - return sanitized - return item + return [sanitized] + sanitized = self.split_clitics(item) + return sanitized def check_word(self, item): if item == '': @@ -484,7 +505,7 @@ def write(self, disambig=False): """ Write the files necessary for Kaldi """ - print('Creating dictionary information...') + self.logger.info('Creating dictionary information...') os.makedirs(self.phones_dir, exist_ok=True) self.generate_mappings() self._write_graphemes() @@ -496,6 +517,7 @@ def write(self, disambig=False): self._write_word_boundaries() self._write_extra_questions() self._write_word_file() + self._write_align_lexicon() self._write_fst_text(disambig=disambig) self._write_fst_binary(disambig=disambig) # self.cleanup() @@ -554,7 +576,7 @@ def _write_word_boundaries(self): open(boundary_int_path, 'w', encoding='utf8') as intf: if self.position_dependent_phones: for p in 
sorted(self.phone_mapping.keys(), key=lambda x: self.phone_mapping[x]): - if p == '': + if p == '' or p.startswith('#'): continue cat = 'nonword' if p.endswith('_B'): @@ -570,11 +592,38 @@ def _write_word_boundaries(self): def _write_word_file(self): words_path = os.path.join(self.output_directory, 'words.txt') - - with open(words_path, 'w', encoding='utf8') as f: + if sys.platform == 'win32': + newline = '' + else: + newline = None + with open(words_path, 'w', encoding='utf8', newline=newline) as f: for w, i in sorted(self.words_mapping.items(), key=lambda x: x[1]): f.write('{} {}\n'.format(w, i)) + def _write_align_lexicon(self): + path = os.path.join(self.phones_dir, 'align_lexicon.int') + + with open(path, 'w', encoding='utf8') as f: + for w, i in self.words_mapping.items(): + if self.word_set is not None and w not in self.word_set: + continue + for pron in sorted(self.words[w], key=lambda x: (x['pronunciation'], x['probability'], x['disambiguation'])): + + phones = [x for x in pron['pronunciation']] + if self.position_dependent_phones: + if len(phones) == 1: + phones[0] += '_S' + else: + for i in range(len(phones)): + if i == 0: + phones[i] += '_B' + elif i == len(phones) - 1: + phones[i] += '_E' + else: + phones[i] += '_I' + p = ' '.join(str(self.phone_mapping[x]) for x in phones) + f.write('{} {} {}\n'.format(i, i, p)) + def _write_topo(self): filepath = os.path.join(self.output_directory, 'topo') sil_transp = 1 / (self.num_sil_states - 1) @@ -707,7 +756,7 @@ def _write_disambig(self): outf.write('{}\n'.format(d)) intf.write('{}\n'.format(self.phone_mapping[d])) - def _write_fst_binary(self, disambig=False): + def _write_fst_binary(self, disambig=False, self_loop=True): if disambig: lexicon_fst_path = os.path.join(self.output_directory, 'lexicon_disambig.text.fst') output_fst = os.path.join(self.output_directory, 'L_disambig.fst') diff --git a/montreal_forced_aligner/exceptions.py b/montreal_forced_aligner/exceptions.py index 9aecd066..c3ad49e6 100644 --- 
a/montreal_forced_aligner/exceptions.py +++ b/montreal_forced_aligner/exceptions.py @@ -48,7 +48,52 @@ class CorpusError(MFAError): pass -class SampleRateError(CorpusError): +class SampleRateMismatchError(CorpusError): + """ + Class for errors in different sample rates + """ + pass + + +class CorpusReadError(CorpusError): + """ + Class for errors in different sample rates + """ + def __init__(self, file_name): + self.file_name = file_name + + +class SampleRateError(CorpusReadError): + """ + Class for errors in different sample rates + """ + pass + + +class BitDepthError(CorpusReadError): + """ + Class for errors in different sample rates + """ + pass + + +class TextParseError(CorpusReadError): + """ + Class for errors in different sample rates + """ + pass + + +class TextGridParseError(CorpusReadError): + """ + Class for errors in different sample rates + """ + def __init__(self, file_name, error): + self.file_name = file_name + self.error = error + + +class WavReadError(CorpusReadError): """ Class for errors in different sample rates """ @@ -113,3 +158,9 @@ class G2PError(MFAError): class LMError(MFAError): pass + + +class KaldiProcessingError(MFAError): + def __init__(self, error_logs): + super(KaldiProcessingError, self).__init__('There was one or more errors when running Kaldi binaries.') + self.error_logs = error_logs \ No newline at end of file diff --git a/montreal_forced_aligner/features/config.py b/montreal_forced_aligner/features/config.py index d34c823c..9ad56684 100644 --- a/montreal_forced_aligner/features/config.py +++ b/montreal_forced_aligner/features/config.py @@ -2,7 +2,8 @@ import shutil import subprocess from ..exceptions import ConfigError -from .processing import mfcc, add_deltas, apply_cmvn, apply_lda +from .processing import mfcc, add_deltas, apply_cmvn, apply_lda, compute_vad, select_voiced, \ + compute_ivector_features, generate_spliced_features from ..helper import thirdparty_binary, load_scp, save_groups @@ -57,9 +58,10 @@ def 
__init__(self, directory=None): self.use_energy = False self.frame_shift = 10 self.pitch = False - self.splice_left_context = None - self.splice_right_context = None + self.splice_left_context = 3 + self.splice_right_context = 3 self.use_mp = True + self.job_specific_configuration = {} def params(self): return {'type': self.type, @@ -77,6 +79,18 @@ def set_features_to_use_lda(self): self.lda = True self.deltas = False + @property + def splice_options(self): + return {'splice_left_context': self.splice_left_context, 'splice_right_context': self.splice_right_context} + + def add_job_specific_config(self, job_name, config): + self.job_specific_configuration[job_name] = config + + def mfcc_options(self, job_name): + options = {'use_energy': self.use_energy} + options.update(self.job_specific_configuration[job_name]) + return options + def update(self, data): for k, v in data.items(): if not hasattr(self, k): @@ -119,27 +133,23 @@ def calc_cmvn(self, corpus): corpus.cmvn_mapping = load_scp(cmvn_scp) pattern = 'cmvn.{}.scp' save_groups(corpus.grouped_cmvn, split_dir, pattern) - apply_cmvn(split_dir, corpus.num_jobs, self) + #apply_cmvn(split_dir, corpus.num_jobs, self) - @property - def raw_feature_id(self): - name = 'features_{}'.format(self.type) - if self.type == 'mfcc': - name += '_cmvn' - return name + def compute_vad(self, corpus, logger=None, vad_config=None): + if logger is None: + log_func = print + else: + log_func = logger.info + split_directory = corpus.split_directory() + if os.path.exists(os.path.join(split_directory, 'vad.0.scp')): + log_func('VAD already computed, skipping!') + return + log_func('Computing VAD...') + compute_vad(split_directory, corpus.num_jobs, self.use_mp, vad_config=vad_config) @property def feature_id(self): - name = 'features_{}'.format(self.type) - if self.type == 'mfcc': - name += '_cmvn' - if self.deltas: - name += '_deltas' - elif self.lda: - name += '_lda' - if self.fmllr: - name += '_fmllr' - return name + return 'feats' 
@property def fmllr_path(self): @@ -149,26 +159,82 @@ def fmllr_path(self): def lda_path(self): return os.path.join(self.directory, 'lda.mat') - def generate_base_features(self, corpus): + def generate_base_features(self, corpus, logger=None, compute_cmvn=True): + if logger is None: + log_func = print + else: + log_func = logger.info split_directory = corpus.split_directory() - if not os.path.exists(os.path.join(split_directory, self.raw_feature_id + '.0.scp')): - print('Generating base features ({})...'.format(self.type)) + for job_name, config in enumerate(corpus.frequency_configs): + self.add_job_specific_config(job_name, config[1]) + feat_id = 'feats' + if not os.path.exists(os.path.join(split_directory, feat_id + '.0.scp')): + log_func('Generating base features ({})...'.format(self.type)) if self.type == 'mfcc': - mfcc(split_directory, corpus.num_jobs, self, corpus.frequency_configs) + mfcc(split_directory, corpus.num_jobs, self) corpus.combine_feats() - print('Calculating CMVN...') - self.calc_cmvn(corpus) - #corpus.parse_features_logs() + if compute_cmvn: + log_func('Calculating CMVN...') + self.calc_cmvn(corpus) - def generate_features(self, corpus, data_directory=None, overwrite=False): + def construct_feature_proc_string(self, data_directory, model_directory, job_name, splice=False, voiced=False, cmvn=True): + if self.directory is None: + self.directory = data_directory + lda_mat_path = None + fmllr_trans_path = None + if model_directory is not None: + lda_mat_path = os.path.join(model_directory, 'lda.mat') + if not os.path.exists(lda_mat_path): + lda_mat_path = None + fmllr_trans_path = os.path.join(model_directory, 'trans.{}'.format(job_name)) + if not os.path.exists(fmllr_trans_path): + fmllr_trans_path = None + if job_name is not None: + utt2spk_path = os.path.join(data_directory, 'utt2spk.{}'.format(job_name)) + cmvn_path = os.path.join(data_directory, 'cmvn.{}.scp'.format(job_name)) + feat_path = os.path.join(data_directory, 
'feats.{}.scp'.format(job_name)) + vad_path = os.path.join(data_directory, 'vad.{}.scp'.format(job_name)) + else: + utt2spk_path = os.path.join(data_directory, 'utt2spk') + cmvn_path = os.path.join(data_directory, 'cmvn.scp') + feat_path = os.path.join(data_directory, 'feats.scp') + vad_path = os.path.join(data_directory, 'vad.scp') + if voiced: + feats = 'ark,s,cs:add-deltas scp:{} ark:- |'.format(feat_path) + if cmvn: + feats += ' apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- |' + feats += ' select-voiced-frames ark:- scp,s,cs:{} ark:- |'.format(vad_path) + elif not os.path.exists(cmvn_path) and cmvn: + feats = 'ark,s,cs:add-deltas scp:{} ark:- |'.format(feat_path) + if cmvn: + feats += ' apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- |' + else: + feats = "ark,s,cs:apply-cmvn --utt2spk=ark:{} scp:{} scp:{} ark:- |".format(utt2spk_path, cmvn_path, feat_path) + if lda_mat_path is not None: + if not os.path.exists(lda_mat_path): + raise Exception('Could not find {}'.format(lda_mat_path)) + feats += ' splice-feats --left-context={} --right-context={} ark:- ark:- |'.format(self.splice_left_context, + self.splice_right_context) + feats += " transform-feats {} ark:- ark:- |".format(lda_mat_path) + elif splice: + feats += ' splice-feats --left-context={} --right-context={} ark:- ark:- |'.format(self.splice_left_context, + self.splice_right_context) + elif self.deltas: + feats += " add-deltas ark:- ark:- |" + + if fmllr_trans_path is not None: + if not os.path.exists(fmllr_trans_path): + raise Exception('Could not find {}'.format(fmllr_trans_path)) + feats += " transform-feats --utt2spk=ark:{} ark,s,cs:{} ark:- ark:- |".format(utt2spk_path, fmllr_trans_path) + return feats + + def generate_features(self, corpus, data_directory=None, overwrite=False, logger=None, cmvn=True): if data_directory is None: data_directory = corpus.split_directory() if self.directory is None: self.directory = data_directory - if 
not overwrite and os.path.exists(os.path.join(data_directory, self.feature_id + '.0.scp')): + if not overwrite and os.path.exists(os.path.join(data_directory, 'feats.0.scp')): return - self.generate_base_features(corpus) - if self.deltas: - add_deltas(data_directory, corpus.num_jobs, self) - elif self.lda: - apply_lda(data_directory, corpus.num_jobs, self) + self.generate_base_features(corpus, logger=logger, compute_cmvn=cmvn) + + diff --git a/montreal_forced_aligner/features/processing.py b/montreal_forced_aligner/features/processing.py index f4a9bdac..c25d40f5 100644 --- a/montreal_forced_aligner/features/processing.py +++ b/montreal_forced_aligner/features/processing.py @@ -1,43 +1,43 @@ import subprocess import os -from ..helper import make_path_safe, thirdparty_binary, filter_scp -from ..exceptions import CorpusError +from ..helper import thirdparty_binary, make_safe from ..multiprocessing import run_mp, run_non_mp -def mfcc_func(directory, job_name, mfcc_config_path): # pragma: no cover +def mfcc_func(directory, job_name, mfcc_options): log_directory = os.path.join(directory, 'log') raw_mfcc_path = os.path.join(directory, 'raw_mfcc.{}.ark'.format(job_name)) raw_scp_path = os.path.join(directory, 'feats.{}.scp'.format(job_name)) log_path = os.path.join(log_directory, 'make_mfcc.{}.log'.format(job_name)) segment_path = os.path.join(directory, 'segments.{}'.format(job_name)) scp_path = os.path.join(directory, 'wav.{}.scp'.format(job_name)) - - with open(log_path, 'w') as f: + utt2num_frames_path = os.path.join(directory, 'utt2num_frames.{}'.format(job_name)) + mfcc_base_command = [thirdparty_binary('compute-mfcc-feats'), '--verbose=2'] + for k, v in mfcc_options.items(): + mfcc_base_command.append('--{}={}'.format(k.replace('_', '-'), make_safe(v))) + with open(log_path, 'w') as log_file: if os.path.exists(segment_path): + mfcc_base_command += ['ark:-', 'ark:-'] seg_proc = subprocess.Popen([thirdparty_binary('extract-segments'), 'scp,p:' + scp_path, 
segment_path, 'ark:-'], - stdout=subprocess.PIPE, stderr=f) - comp_proc = subprocess.Popen([thirdparty_binary('compute-mfcc-feats'), '--verbose=2', - '--config=' + mfcc_config_path, - 'ark:-', 'ark:-'], - stdout=subprocess.PIPE, stderr=f, stdin=seg_proc.stdout) + stdout=subprocess.PIPE, stderr=log_file) + comp_proc = subprocess.Popen(mfcc_base_command, + stdout=subprocess.PIPE, stderr=log_file, stdin=seg_proc.stdout) else: - - comp_proc = subprocess.Popen([thirdparty_binary('compute-mfcc-feats'), '--verbose=2', - '--config=' + mfcc_config_path, - 'scp,p:' + scp_path, 'ark:-'], - stdout=subprocess.PIPE, stderr=f) + mfcc_base_command += ['scp,p:' + scp_path, 'ark:-'] + comp_proc = subprocess.Popen(mfcc_base_command, + stdout=subprocess.PIPE, stderr=log_file) copy_proc = subprocess.Popen([thirdparty_binary('copy-feats'), - '--compress=true', 'ark:-', + '--compress=true', '--write-num-frames=ark,t:' + utt2num_frames_path, + 'ark:-', 'ark,scp:{},{}'.format(raw_mfcc_path, raw_scp_path)], - stdin=comp_proc.stdout, stderr=f) + stdin=comp_proc.stdout, stderr=log_file) copy_proc.wait() -def mfcc(mfcc_directory, num_jobs, feature_config, frequency_configs): +def mfcc(mfcc_directory, num_jobs, feature_config): """ Multiprocessing function that converts wav files into MFCCs @@ -63,114 +63,235 @@ def mfcc(mfcc_directory, num_jobs, feature_config, frequency_configs): If the files per speaker exceeds the number of files that are allowed to be open on the computer (for Unix-based systems) """ + log_directory = os.path.join(mfcc_directory, 'log') + os.makedirs(log_directory, exist_ok=True) - os.makedirs(os.path.join(mfcc_directory, 'log'), exist_ok=True) - paths = [] - for j, p in frequency_configs: - paths.append(feature_config.write(mfcc_directory, j, p)) - jobs = [(mfcc_directory, x, paths[x]) + jobs = [(mfcc_directory, x, feature_config.mfcc_options(x)) for x in range(num_jobs)] if feature_config.use_mp: - run_mp(mfcc_func, jobs) + run_mp(mfcc_func, jobs, log_directory) + else: 
+ run_non_mp(mfcc_func, jobs, log_directory) + + +def compute_vad_func(directory, vad_config, job_name): + feats_path = os.path.join(directory, 'feats.{}.scp'.format(job_name)) + vad_scp_path = os.path.join(directory, 'vad.{}.scp'.format(job_name)) + with open(os.path.join(directory, 'log', 'vad.{}.log'.format(job_name)), 'w') as log_file: + vad_proc = subprocess.Popen([thirdparty_binary('compute-vad'), + '--vad-energy-mean-scale={}'.format(vad_config['energy_mean_scale']), + '--vad-energy-threshold={}'.format(vad_config['energy_threshold']), + 'scp:' + feats_path, + 'ark,t:{}'.format(vad_scp_path)], + stderr=log_file + ) + vad_proc.communicate() + + +def compute_vad(directory, num_jobs, use_mp, vad_config=None): + log_directory = os.path.join(directory, 'log') + os.makedirs(log_directory, exist_ok=True) + if vad_config is None: + vad_config = {'energy_threshold': 5.5, + 'energy_mean_scale': 0.5} + jobs = [(directory, vad_config, x) + for x in range(num_jobs)] + if use_mp: + run_mp(compute_vad_func, jobs, log_directory) else: - run_non_mp(mfcc_func, jobs) + run_non_mp(compute_vad_func, jobs, log_directory) def apply_cmvn_func(directory, job_name, config): normed_scp_path = os.path.join(directory, config.raw_feature_id + '.{}.scp'.format(job_name)) normed_ark_path = os.path.join(directory, config.raw_feature_id + '.{}.ark'.format(job_name)) - with open(os.path.join(directory, 'log', 'norm.{}.log'.format(job_name)), 'w') as logf: - utt2spkpath = os.path.join(directory, 'utt2spk.{}'.format(job_name)) - cmvnpath = os.path.join(directory, 'cmvn.{}.scp'.format(job_name)) - featspath = os.path.join(directory, 'feats.{}.scp'.format(job_name)) + with open(os.path.join(directory, 'log', 'norm.{}.log'.format(job_name)), 'w') as log_file: + utt2spk_path = os.path.join(directory, 'utt2spk.{}'.format(job_name)) + cmvn_path = os.path.join(directory, 'cmvn.{}.scp'.format(job_name)) + feats_path = os.path.join(directory, 'feats.{}.scp'.format(job_name)) if not 
os.path.exists(normed_scp_path): cmvn_proc = subprocess.Popen([thirdparty_binary('apply-cmvn'), - '--utt2spk=ark:' + utt2spkpath, - 'scp:' + cmvnpath, - 'scp:' + featspath, + '--utt2spk=ark:' + utt2spk_path, + 'scp:' + cmvn_path, + 'scp:' + feats_path, 'ark,scp:{},{}'.format(normed_ark_path, normed_scp_path)], - stderr=logf + stderr=log_file ) cmvn_proc.communicate() def apply_cmvn(directory, num_jobs, config): + log_directory = os.path.join(directory, 'log') + os.makedirs(log_directory, exist_ok=True) jobs = [(directory, x, config) for x in range(num_jobs)] if config.use_mp: - run_mp(apply_cmvn_func, jobs) + run_mp(apply_cmvn_func, jobs, log_directory) else: - run_non_mp(apply_cmvn_func, jobs) + run_non_mp(apply_cmvn_func, jobs, log_directory) + + +def select_voiced_func(directory, job_name, apply_cmn): + feats_path = os.path.join(directory, 'feats.{}.scp'.format(job_name)) + vad_scp_path = os.path.join(directory, 'vad.{}.scp'.format(job_name)) + voiced_scp_path = os.path.join(directory, 'feats_voiced.{}.scp'.format(job_name)) + voiced_ark_path = os.path.join(directory, 'feats_voiced.{}.ark'.format(job_name)) + with open(os.path.join(directory, 'log', 'select-voiced.{}.log'.format(job_name)), 'w') as log_file: + deltas_proc = subprocess.Popen([thirdparty_binary('add-deltas'), + 'scp:' + feats_path, + 'ark:-' + ], stdout=subprocess.PIPE, stderr=log_file) + if apply_cmn: + cmvn_proc = subprocess.Popen([thirdparty_binary('apply-cmvn-sliding'), + '--norm-vars=false', + '--center=true', + '--cmn-window=300', + 'ark:-', 'ark:-'], + stdin=deltas_proc.stdout, stdout=subprocess.PIPE, stderr=log_file) + select_proc = subprocess.Popen([thirdparty_binary('select-voiced-frames'), + 'ark:-', + 'scp,s,cs:' + vad_scp_path, + 'ark,scp:{},{}'.format(voiced_ark_path, voiced_scp_path)], + stdin=cmvn_proc.stdout, stderr=log_file) + else: + select_proc = subprocess.Popen([thirdparty_binary('select-voiced-frames'), + 'ark:-', + 'scp,s,cs:' + vad_scp_path, + 
'ark,scp:{},{}'.format(voiced_ark_path, voiced_scp_path)], + stdin=deltas_proc.stdout, stderr=log_file) + select_proc.communicate() + + +def select_voiced(directory, num_jobs, config, apply_cmn=False): + log_directory = os.path.join(directory, 'log') + os.makedirs(log_directory, exist_ok=True) + jobs = [(directory, x, apply_cmn) + for x in range(num_jobs)] + if config.use_mp: + run_mp(select_voiced_func, jobs, log_directory) + else: + run_non_mp(select_voiced_func, jobs, log_directory) + + +def compute_ivector_features_func(directory, job_name, apply_cmn): + feats_path = os.path.join(directory, 'feats.{}.scp'.format(job_name)) + out_feats_scp_path = os.path.join(directory, 'feats_for_ivector.{}.scp'.format(job_name)) + out_feats_ark_path = os.path.join(directory, 'feats_for_ivector.{}.ark'.format(job_name)) + + with open(os.path.join(directory, 'log', 'cmvn_sliding.{}.log'.format(job_name)), 'w') as log_file: + if apply_cmn: + deltas_proc = subprocess.Popen([thirdparty_binary('add-deltas'), + 'scp:' + feats_path, + 'ark:-' + ], stdout=subprocess.PIPE, stderr=log_file) + + cmvn_proc = subprocess.Popen([thirdparty_binary('apply-cmvn-sliding'), + '--norm-vars=false', + '--center=true', + '--cmn-window=300', + 'ark:-', 'ark,scp:{},{}'.format(out_feats_ark_path, out_feats_scp_path)], + stdin=deltas_proc.stdout, stdout=subprocess.PIPE, stderr=log_file) + cmvn_proc.communicate() + else: + deltas_proc = subprocess.Popen([thirdparty_binary('add-deltas'), + 'scp:' + feats_path, + 'ark,scp:{},{}'.format(out_feats_ark_path, out_feats_scp_path) + ], stderr=log_file) + deltas_proc.communicate() + + +def compute_ivector_features(directory, num_jobs, config, apply_cmn=False): + log_directory = os.path.join(directory, 'log') + os.makedirs(log_directory, exist_ok=True) + jobs = [(directory, x, apply_cmn) + for x in range(num_jobs)] + if config.use_mp: + run_mp(compute_ivector_features_func, jobs, log_directory) + else: + run_non_mp(compute_ivector_features_func, jobs, log_directory) 
+ + +def generate_spliced_features_func(directory, raw_feature_id, config, job_name): + normed_scp_path = os.path.join(directory, raw_feature_id + '.{}.scp'.format(job_name)) + spliced_feature_id = raw_feature_id + '_spliced' + ark_path = os.path.join(directory, spliced_feature_id + '.{}.ark'.format(job_name)) + scp_path = os.path.join(directory, spliced_feature_id + '.{}.scp'.format(job_name)) + log_path = os.path.join(directory, 'log', 'lda.{}.log'.format(job_name)) + with open(log_path, 'a') as log_file: + splice_feats_proc = subprocess.Popen([thirdparty_binary('splice-feats'), + '--left-context={}'.format(config['splice_left_context']), + '--right-context={}'.format(config['splice_right_context']), + 'scp:' + normed_scp_path, + 'ark,scp:{},{}'.format(ark_path, scp_path)], + stderr=log_file) + splice_feats_proc.communicate() + + +def generate_spliced_features(directory, num_jobs, config): + log_directory = os.path.join(directory, 'log') + os.makedirs(log_directory, exist_ok=True) + jobs = [(directory, config.raw_feature_id, config.splice_options, x) + for x in range(num_jobs)] + if config.use_mp: + run_mp(generate_spliced_features_func, jobs, log_directory) + else: + run_non_mp(generate_spliced_features_func, jobs, log_directory) def add_deltas_func(directory, job_name, config): normed_scp_path = os.path.join(directory, config.raw_feature_id + '.{}.scp'.format(job_name)) ark_path = os.path.join(directory, config.feature_id + '.{}.ark'.format(job_name)) scp_path = os.path.join(directory, config.feature_id + '.{}.scp'.format(job_name)) - with open(os.path.join(directory, 'log', 'add_deltas.{}.log'.format(job_name)), 'w') as logf: + with open(os.path.join(directory, 'log', 'add_deltas.{}.log'.format(job_name)), 'w') as log_file: if config.fmllr_path is not None and os.path.exists(config.fmllr_path): deltas_proc = subprocess.Popen([thirdparty_binary('add-deltas'), 'scp:' + normed_scp_path, 'ark:-'], - stderr=logf, + stderr=log_file, stdout=subprocess.PIPE) 
trans_proc = subprocess.Popen([thirdparty_binary('transform-feats'), 'ark:' + config.fmllr_path, 'ark:-', 'ark,scp:{},{}'.format(ark_path, scp_path)], stdin=deltas_proc.stdout, - stderr=logf) + stderr=log_file) trans_proc.communicate() else: deltas_proc = subprocess.Popen([thirdparty_binary('add-deltas'), 'scp:' + normed_scp_path, 'ark,scp:{},{}'.format(ark_path, scp_path)], - stderr=logf) + stderr=log_file) deltas_proc.communicate() def add_deltas(directory, num_jobs, config): + log_directory = os.path.join(directory, 'log') + os.makedirs(log_directory, exist_ok=True) jobs = [(directory, x, config) for x in range(num_jobs)] if config.use_mp: - run_mp(add_deltas_func, jobs) + run_mp(add_deltas_func, jobs, log_directory) else: - run_non_mp(add_deltas_func, jobs) + run_non_mp(add_deltas_func, jobs, log_directory) -def apply_lda_func(directory, job_name, config): - normed_scp_path = os.path.join(directory, config.raw_feature_id + '.{}.scp'.format(job_name)) - ark_path = os.path.join(directory, config.feature_id + '.{}.ark'.format(job_name)) - scp_path = os.path.join(directory, config.feature_id + '.{}.scp'.format(job_name)) +def apply_lda_func(directory, spliced_feature_id, feature_id, lda_path, job_name): + normed_scp_path = os.path.join(directory, spliced_feature_id + '.{}.scp'.format(job_name)) + ark_path = os.path.join(directory, feature_id + '.{}.ark'.format(job_name)) + scp_path = os.path.join(directory, feature_id + '.{}.scp'.format(job_name)) log_path = os.path.join(directory, 'log', 'lda.{}.log'.format(job_name)) - with open(log_path, 'a') as logf: - if os.path.exists(config.lda_path): - splice_feats_proc = subprocess.Popen([thirdparty_binary('splice-feats'), - '--left-context={}'.format(config.splice_left_context), - '--right-context={}'.format(config.splice_right_context), - 'scp:' + normed_scp_path, - 'ark:-'], - stdout=subprocess.PIPE, - stderr=logf) - transform_feats_proc = subprocess.Popen([thirdparty_binary("transform-feats"), - config.lda_path, - 
'ark:-', - 'ark,scp:{},{}'.format(ark_path, scp_path)], - stdin=splice_feats_proc.stdout, - stderr=logf) - transform_feats_proc.communicate() - else: - logf.write('could not find "{}"\n'.format(config.lda_path)) - splice_feats_proc = subprocess.Popen([thirdparty_binary('splice-feats'), - '--left-context={}'.format(config.splice_left_context), - '--right-context={}'.format(config.splice_right_context), - 'scp:' + normed_scp_path, - 'ark,scp:{},{}'.format(ark_path, scp_path)], - stderr=logf) - splice_feats_proc.communicate() + with open(log_path, 'a') as log_file: + transform_feats_proc = subprocess.Popen([thirdparty_binary("transform-feats"), + lda_path, + 'scp:'+ normed_scp_path, + 'ark,scp:{},{}'.format(ark_path, scp_path)], + stderr=log_file) + transform_feats_proc.communicate() def apply_lda(directory, num_jobs, config): - jobs = [(directory, x, config) + log_directory = os.path.join(directory, 'log') + os.makedirs(log_directory, exist_ok=True) + jobs = [(directory, config.spliced_feature_id, config.feature_id, config.lda_path, x) for x in range(num_jobs)] - if config.use_mp: - run_mp(apply_lda_func, jobs) + if config.use_mp and False: # Looks to be threaded + run_mp(apply_lda_func, jobs, log_directory) else: - run_non_mp(apply_lda_func, jobs) + run_non_mp(apply_lda_func, jobs, log_directory) diff --git a/montreal_forced_aligner/g2p/generator.py b/montreal_forced_aligner/g2p/generator.py index a750fc79..b627641e 100644 --- a/montreal_forced_aligner/g2p/generator.py +++ b/montreal_forced_aligner/g2p/generator.py @@ -92,7 +92,7 @@ def run(self): self.return_dict[word] = rep except Exception as e: self.stopped.stop() - self.return_dict['error'] = word, Exception(traceback.format_exception(*sys.exc_info())) + self.return_dict['MFA_EXCEPTION'] = word, Exception(traceback.format_exception(*sys.exc_info())) self.counter.increment() return @@ -145,7 +145,7 @@ def generate(self): job_queue = mp.JoinableQueue(100) ind = 0 num_words = len(self.words) - words = 
sorted(self.words) + words = self.words begin = time.time() last_value = 0 missing_graphemes = set() @@ -200,13 +200,15 @@ def generate(self): job_queue.join() for p in procs: p.join() - if 'error' in return_dict: - element, exc = return_dict['error'] + if 'MFA_EXCEPTION' in return_dict: + element, exc = return_dict['MFA_EXCEPTION'] print(element) raise exc - to_return.update(return_dict) - print('Processed {} in {} seconds'.format(len(self.words), time.time()-begin)) - self.logger.debug('Processed {} in {} seconds'.format(len(self.words), time.time()-begin)) + for w in self.words: + if w in return_dict: + to_return[w] = return_dict[w] + print('Processed {} in {} seconds'.format(num_words, time.time()-begin)) + self.logger.debug('Processed {} in {} seconds'.format(num_words, time.time()-begin)) return to_return def output(self, outfile): diff --git a/montreal_forced_aligner/g2p/trainer.py b/montreal_forced_aligner/g2p/trainer.py index 15f6e47b..31e183ec 100644 --- a/montreal_forced_aligner/g2p/trainer.py +++ b/montreal_forced_aligner/g2p/trainer.py @@ -58,6 +58,7 @@ def compute_validation_errors(gold_values, hypothesis_values, num_jobs=3): with mp.Pool(num_jobs) as pool: to_comp = [] for word, hyp in hypothesis_values.items(): + print(word, gold_values[word]) g = gold_values[word][0]['pronunciation'] h = hyp.split(' ') to_comp.append((g, h)) @@ -72,6 +73,7 @@ def compute_validation_errors(gold_values, hypothesis_values, num_jobs=3): for w, gold in gold_values.items(): if w not in hypothesis_values: incorrect += 1 + print(w, gold) gold = gold[0]['pronunciation'] total_edits += len(gold) total_length += len(gold) diff --git a/montreal_forced_aligner/helper.py b/montreal_forced_aligner/helper.py index e2490344..7cc3c560 100644 --- a/montreal_forced_aligner/helper.py +++ b/montreal_forced_aligner/helper.py @@ -2,7 +2,11 @@ import shutil import numpy from typing import Any, List, Tuple -from .exceptions import ThirdpartyError +import logging +import sys + +from 
.exceptions import ThirdpartyError, KaldiProcessingError + Labels = List[Any] @@ -33,13 +37,17 @@ def output_mapping(mapping, path): with open(path, 'w', encoding='utf8') as f: for k in sorted(mapping.keys()): v = mapping[k] - if isinstance(v, list): - v = ' '.join(v) + if isinstance(v, (list, set, tuple)): + v = ' '.join(map(str, v)) f.write('{} {}\n'.format(k, v)) def save_scp(scp, path, sort=True, multiline=False): - with open(path, 'w', encoding='utf8') as f: + if sys.platform == 'win32': + newline = '' + else: + newline = None + with open(path, 'w', encoding='utf8', newline=newline) as f: if sort: scp = sorted(scp) for line in scp: @@ -55,7 +63,7 @@ def save_groups(groups, seg_dir, pattern, multiline=False): save_scp(g, path, multiline=multiline) -def load_scp(path): +def load_scp(path, data_type=str): """ Load a Kaldi script file (.scp) @@ -65,6 +73,8 @@ def load_scp(path): ---------- path : str Path to Kaldi script file + data_type : type + Type to coerce the data to Returns ------- @@ -84,7 +94,7 @@ def load_scp(path): if len(line_list) == 1: value = line_list[0] else: - value = line_list + value = [ data_type(x) for x in line_list if x not in ['[', ']']] scp[key] = value return scp @@ -143,3 +153,54 @@ def score(args: Tuple[Labels, Labels]) -> Tuple[int, int]: """Computes sufficient statistics for LER calculation.""" edits = edit_distance(gold, hypo) return edits, len(gold) + + +def setup_logger(identifier, output_directory): + os.makedirs(output_directory, exist_ok=True) + log_path = os.path.join(output_directory, identifier + '.log') + if os.path.exists(log_path): + os.remove(log_path) + print(log_path) + logger = logging.getLogger(identifier) + logger.setLevel(logging.DEBUG) + + handler = logging.FileHandler(log_path) + handler.setLevel(logging.DEBUG) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + + handler = logging.StreamHandler(sys.stdout) + 
handler.setLevel(logging.INFO) + formatter = logging.Formatter('%(levelname)s - %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + + return logger + + +def parse_logs(log_directory): + error_logs = [] + for name in os.listdir(log_directory): + log_path = os.path.join(log_directory, name) + with open(log_path, 'r', encoding='utf8') as f: + for line in f: + line = line.strip() + if 'error while loading shared libraries: libopenblas.so.0' in line: + raise ThirdpartyError('There was a problem locating libopenblas.so.0. ' + 'Try installing openblas via system package manager?') + if line.startswith('ERROR') or line.startswith('ASSERTION_FAILED'): + error_logs.append(log_path) + break + if error_logs: + raise KaldiProcessingError(error_logs) + + +def log_kaldi_errors(error_logs, logger): + logger.debug('There were {} kaldi processing files that had errors:'.format(len(error_logs))) + for path in error_logs: + logger.debug('') + logger.debug(path) + with open(path, 'r', encoding='utf8') as f: + for line in f: + logger.debug('\t' + line.strip()) \ No newline at end of file diff --git a/montreal_forced_aligner/lm/trainer.py b/montreal_forced_aligner/lm/trainer.py index f67ac5b5..2b8bbe03 100644 --- a/montreal_forced_aligner/lm/trainer.py +++ b/montreal_forced_aligner/lm/trainer.py @@ -1,9 +1,7 @@ import os import subprocess -import random from ..models import LanguageModel from ..corpus import AlignableCorpus -from ..helper import thirdparty_binary from ..exceptions import LMError from ..config import TEMP_DIR @@ -35,6 +33,7 @@ def __init__(self, source, config, output_model_path, dictionary=None, temp_dire supplemental_model_path=None, supplemental_model_weight=1): if not temp_directory: temp_directory = TEMP_DIR + self.models_temp_dir = os.path.join(temp_directory, 'models', 'LM') temp_directory = os.path.join(temp_directory, 'LM') self.name, _ = os.path.splitext(os.path.basename(output_model_path)) @@ -48,8 +47,18 @@ def __init__(self, source, 
config, output_model_path, dictionary=None, temp_dire self.source_model_weight = 1 self.supplemental_model_weight = supplemental_model_weight + @property + def meta(self): + from .. import __version__ + return {'type': 'ngram', + 'order': self.config.order, + 'method': self.config.method, + 'prune': self.config.prune, + 'version': __version__} + def train(self): mod_path = os.path.join(self.temp_directory, self.name + '.mod') + large_model_path = os.path.join(self.temp_directory, self.name + '.arpa') if isinstance(self.source, AlignableCorpus): sym_path = os.path.join(self.temp_directory, self.name + '.sym') far_path = os.path.join(self.temp_directory, self.name + '.far') @@ -82,7 +91,13 @@ def train(self): mod_path, supplemental_path, merged_path]) mod_path = merged_path - subprocess.call(['ngramprint', '--ARPA', mod_path, self.output_model_path]) + subprocess.call(['ngramprint', '--ARPA', mod_path, large_model_path]) + + directory, filename = os.path.split(self.output_model_path) + basename, _ = os.path.splitext(filename) + model = LanguageModel.empty(basename, root_directory=self.models_temp_dir) + model.add_meta_file(self) + model.add_arpa_file(large_model_path) if self.config.prune: small_mod_path = mod_path.replace('.mod', '_small.mod') @@ -97,6 +112,11 @@ def train(self): med_output_path = self.output_model_path.replace('.arpa', '_med.arpa') subprocess.call(['ngramprint', '--ARPA', small_mod_path, small_output_path]) subprocess.call(['ngramprint', '--ARPA', med_mod_path, med_output_path]) + model.add_arpa_file(med_output_path) + model.add_arpa_file(small_output_path) + basename, _ = os.path.splitext(self.output_model_path) + model.dump(basename) + model.clean_up() diff --git a/montreal_forced_aligner/models.py b/montreal_forced_aligner/models.py index 5f85a5ea..cfbd9557 100644 --- a/montreal_forced_aligner/models.py +++ b/montreal_forced_aligner/models.py @@ -6,7 +6,6 @@ from . 
import __version__ from .exceptions import PronunciationAcousticMismatchError - # default format for output FORMAT = "zip" @@ -38,6 +37,18 @@ def __init__(self, source, root_directory=None): os.makedirs(root_directory, exist_ok=True) unpack_archive(source, base) + @property + def meta(self): + if not self._meta: + meta_path = os.path.join(self.dirname, 'meta.yaml') + with open(meta_path, 'r', encoding='utf8') as f: + self._meta = yaml.load(f, Loader=yaml.SafeLoader) + return self._meta + + def add_meta_file(self, trainer): + with open(os.path.join(self.dirname, 'meta.yaml'), 'w', encoding='utf8') as f: + yaml.dump(trainer.meta, f) + @classmethod def empty(cls, head, root_directory=None): """ @@ -74,6 +85,7 @@ def dump(self, sink, archive_fmt=FORMAT): class AcousticModel(Archive): + files = ['final.mdl', 'final.occs', 'lda.mat', 'tree'] def add_meta_file(self, aligner): with open(os.path.join(self.dirname, 'meta.yaml'), 'w', encoding='utf8') as f: yaml.dump(aligner.meta, f) @@ -112,30 +124,21 @@ def meta(self): self._meta['phones'] = set(self._meta.get('phones', [])) return self._meta - def add_lda_matrix(self, source): - copyfile(os.path.join(source, 'lda.mat'), os.path.join(self.dirname, 'lda.mat')) - - def add_ivector_model(self, source): - copyfile(os.path.join(source, 'final.ie'), os.path.join(self.dirname, 'final.ie')) - copyfile(os.path.join(source, 'final.dubm'), os.path.join(self.dirname, 'final.dubm')) - def add_model(self, source): """ Add file into archive """ - copyfile(os.path.join(source, 'final.mdl'), os.path.join(self.dirname, 'final.mdl')) - if os.path.exists(os.path.join(source, 'final.occs')): - copyfile(os.path.join(source, 'final.occs'), os.path.join(self.dirname, 'final.occs')) - copyfile(os.path.join(source, 'tree'), os.path.join(self.dirname, 'tree')) + for f in self.files: + if os.path.exists(os.path.join(source, f)): + copyfile(os.path.join(source, f), os.path.join(self.dirname, f)) def export_model(self, destination): """ """ 
os.makedirs(destination, exist_ok=True) - copyfile(os.path.join(self.dirname, 'final.mdl'), os.path.join(destination, 'final.mdl')) - if os.path.exists(os.path.join(self.dirname, 'final.occs')): - copyfile(os.path.join(self.dirname, 'final.occs'), os.path.join(destination, 'final.occs')) - copyfile(os.path.join(self.dirname, 'tree'), os.path.join(destination, 'tree')) + for f in self.files: + if os.path.exists(os.path.join(self.dirname, f)): + copyfile(os.path.join(self.dirname, f), os.path.join(destination, f)) def validate(self, dictionary): if isinstance(dictionary, G2PModel): @@ -148,6 +151,36 @@ def validate(self, dictionary): raise (PronunciationAcousticMismatchError(missing_phones)) +class IvectorExtractor(Archive): + """ + Archive for i-vector extractors + """ + model_files = ['final.ie', 'final.ubm', 'final.dubm', 'plda', 'mean.vec', 'trans.mat', + 'speaker_classifier.mdl', 'speaker_labels.txt'] + + def add_model(self, source): + """ + Add file into archive + """ + for filename in self.model_files: + if os.path.exists(os.path.join(source, filename)): + copyfile(os.path.join(source, filename), os.path.join(self.dirname, filename)) + + def export_model(self, destination): + """ + """ + os.makedirs(destination, exist_ok=True) + for filename in self.model_files: + if os.path.exists(os.path.join(self.dirname, filename)): + copyfile(os.path.join(self.dirname, filename), os.path.join(destination, filename)) + + @property + def feature_config(self): + from .features.config import FeatureConfig + fc = FeatureConfig(self.dirname) + fc.update(self.meta['features']) + return fc + class G2PModel(Archive): def add_meta_file(self, dictionary, architecture): @@ -171,7 +204,6 @@ def meta(self): self._meta = yaml.load(f, Loader=yaml.SafeLoader) self._meta['phones'] = set(self._meta.get('phones', [])) self._meta['graphemes'] = set(self._meta.get('graphemes', [])) - print(self._meta) return self._meta @property @@ -213,33 +245,6 @@ def validate(self, word_list): return True 
-class IvectorExtractor(Archive): - """ - Archive for i-vector extractors - """ - def add_meta_file(self, trainer): - with open(os.path.join(self.dirname, 'meta.yaml'), 'w', encoding='utf8') as f: - yaml.dump(trainer.meta, f) - - @property - def meta(self): - if not self._meta: - meta_path = os.path.join(self.dirname, 'meta.yaml') - with open(meta_path, 'r', encoding='utf8') as f: - self._meta = yaml.load(f, Loader=yaml.SafeLoader) - return self._meta - - def add_model(self, source): - """ - Add file into archive - """ - copyfile(os.path.join(source, 'final.ie'), os.path.join(self.dirname, 'final.ie')) - copyfile(os.path.join(source, 'final.dubm'), os.path.join(self.dirname, 'final.dubm')) - lda_path = os.path.join(source, 'lda.mat') - if os.path.exists(lda_path): - copyfile(lda_path, os.path.join(self.dirname, 'lda.mat')) - - class LanguageModel(Archive): extension = '.arpa' @@ -266,5 +271,9 @@ def __init__(self, source, root_directory=None): @property def arpa_path(self): + print(os.listdir(self.dirname)) return os.path.join(self.dirname, self.name + self.extension) + def add_arpa_file(self, arpa_path): + name = os.path.basename(arpa_path) + copyfile(arpa_path, os.path.join(self.dirname, name)) diff --git a/montreal_forced_aligner/multiprocessing.py b/montreal_forced_aligner/multiprocessing.py deleted file mode 100644 index eda6cc19..00000000 --- a/montreal_forced_aligner/multiprocessing.py +++ /dev/null @@ -1,1877 +0,0 @@ -import multiprocessing as mp -import subprocess -import os -import shutil -import re -import time -from decimal import Decimal -import statistics - -from .helper import make_path_safe, thirdparty_binary - -from .textgrid import ctm_to_textgrid, parse_ctm - -from .exceptions import AlignmentError - - -class Counter(object): - def __init__(self, initval=0): - self.val = mp.Value('i', initval) - self.lock = mp.Lock() - - def increment(self): - with self.lock: - self.val.value += 1 - - def value(self): - with self.lock: - return self.val.value - - 
-class Stopped(object): - def __init__(self, initval=False): - self.val = mp.Value('i', initval) - self.lock = mp.Lock() - - def stop(self): - with self.lock: - self.val.value = True - - def stop_check(self): - with self.lock: - return self.val.value - - -class ProcessWorker(mp.Process): - def __init__(self, function, arguments): - mp.Process.__init__(self) - self.function = function - self.arguments = arguments - - def run(self): - time.sleep(10) - try: - print(self.arguments) - _ = self.function(*self.arguments) - except Exception as e: - print(e) - - -def run_non_mp(function, argument_list): - for args in argument_list: - function(*args) - - -def run_mp(function, argument_list): - with mp.get_context("spawn").Pool(processes=len(argument_list)) as p: - results = p.starmap(function, argument_list, chunksize=1) - - -def acc_stats_func(directory, iteration, job_name, feat_path): - log_path = os.path.join(directory, 'log', 'acc.{}.{}.log'.format(iteration, job_name)) - model_path = os.path.join(directory, '{}.mdl'.format(iteration)) - acc_path = os.path.join(directory, '{}.{}.acc'.format(iteration, job_name)) - ali_path = os.path.join(directory, 'ali.{}'.format(job_name)) - with open(log_path, 'w', encoding='utf8') as logf: - acc_proc = subprocess.Popen([thirdparty_binary('gmm-acc-stats-ali'), model_path, - "scp:" + feat_path, "ark,t:" + ali_path, acc_path], - stderr=logf) - acc_proc.communicate() - - -def acc_stats(iteration, directory, split_directory, num_jobs, config): - """ - Multiprocessing function that computes stats for GMM training - - See http://kaldi-asr.org/doc/gmm-acc-stats-ali_8cc.html for more details - on the Kaldi binary this runs. 
- - Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_mono.sh - for the bash script this function was extracted from - - Parameters - ---------- - iteration : int - Iteration to calculate stats for - directory : str - Directory of training (monophone, triphone, speaker-adapted triphone - training directories) - split_directory : str - Directory of training data split into the number of jobs - num_jobs : int - The number of processes to use in calculation - """ - feat_name = config.feature_file_base_name - - feat_name += '.{}.scp' - - jobs = [(directory, iteration, x, os.path.join(split_directory, feat_name.format(x))) - for x in range(num_jobs)] - if config.use_mp: - run_mp(acc_stats_func, jobs) - else: - run_non_mp(acc_stats_func, jobs) - - -def parse_transitions(path, phones_path): - state_extract_pattern = re.compile(r'Transition-state (\d+): phone = (\w+)') - id_extract_pattern = re.compile(r'Transition-id = (\d+)') - cur_phone = None - current = 0 - with open(path, encoding='utf8') as f, open(phones_path, 'w', encoding='utf8') as outf: - outf.write('{} {}\n'.format('', 0)) - for line in f: - line = line.strip() - if line.startswith('Transition-state'): - m = state_extract_pattern.match(line) - _, phone = m.groups() - if phone != cur_phone: - current = 0 - cur_phone = phone - else: - m = id_extract_pattern.match(line) - id = m.groups()[0] - outf.write('{}_{} {}\n'.format(phone, current, id)) - current += 1 - - -def compile_train_graphs_func(directory, lang_directory, split_directory, job_name, debug=True): - fst_path = os.path.join(directory, 'fsts.{}'.format(job_name)) - tree_path = os.path.join(directory, 'tree') - mdl_path = os.path.join(directory, '0.mdl') - if not os.path.exists(mdl_path): - mdl_path = os.path.join(directory, 'final.mdl') - - log_path = os.path.join(directory, 'log', 'show_transition.log') - transition_path = os.path.join(directory, 'transitions.txt') - phones_file_path = os.path.join(lang_directory, 
'phones.txt') - - triphones_file_path = os.path.join(directory, 'triphones.txt') - if debug: - with open(log_path, 'w', encoding='utf8') as logf: - with open(transition_path, 'w', encoding='utf8') as f: - subprocess.call([thirdparty_binary('show-transitions'), phones_file_path, mdl_path], - stdout=f, stderr=logf) - parse_transitions(transition_path, triphones_file_path) - log_path = os.path.join(directory, 'log', 'compile-graphs.0.{}.log'.format(job_name)) - - if os.path.exists(triphones_file_path): - phones_file_path = triphones_file_path - words_file_path = os.path.join(lang_directory, 'words.txt') - - with open(os.path.join(split_directory, 'text.{}.int'.format(job_name)), 'r', encoding='utf8') as inf, \ - open(fst_path, 'wb') as outf, \ - open(log_path, 'w', encoding='utf8') as logf: - proc = subprocess.Popen([thirdparty_binary('compile-train-graphs'), - '--read-disambig-syms={}'.format( - os.path.join(lang_directory, 'phones', 'disambig.int')), - tree_path, mdl_path, - os.path.join(lang_directory, 'L.fst'), - "ark:-", "ark:-"], - stdin=inf, stdout=outf, stderr=logf) - proc.communicate() - - if debug: - utterances = [] - with open(os.path.join(split_directory, 'utt2spk.{}'.format(job_name)), 'r', encoding='utf8') as f: - for line in f: - utt = line.split()[0].strip() - if not utt: - continue - utterances.append(utt) - - with open(log_path, 'a', encoding='utf8') as logf: - fst_ark_path = os.path.join(directory, 'fsts.{}.ark'.format(job_name)) - fst_scp_path = os.path.join(directory, 'fsts.{}.scp'.format(job_name)) - proc = subprocess.Popen([thirdparty_binary('fstcopy'), - 'ark:{}'.format(fst_path), - 'ark,scp:{},{}'.format(fst_ark_path, fst_scp_path)], stderr=logf) - proc.communicate() - - temp_fst_path = os.path.join(directory, 'temp.fst.{}'.format(job_name)) - - with open(fst_scp_path, 'r', encoding='utf8') as f: - for line in f: - line = line.strip() - utt = line.split()[0] - - dot_path = os.path.join(directory, '{}.dot'.format(utt)) - fst_proc = 
def compile_train_graphs(directory, lang_directory, split_directory, num_jobs, config, debug=False):
    """
    Multiprocessing function that compiles training graphs for utterances.

    See http://kaldi-asr.org/doc/compile-train-graphs_8cc.html for more details
    on the Kaldi binary this function calls.

    Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_mono.sh
    for the bash script that this function was extracted from.

    Parameters
    ----------
    directory : str
        Directory of training (monophone, triphone, speaker-adapted triphone
        training directories)
    lang_directory : str
        Directory of the language model used
    split_directory : str
        Directory of training data split into the number of jobs
    num_jobs : int
        The number of processes to use
    config : object
        Training configuration; ``config.use_mp`` selects multiprocessing vs serial execution
    debug : bool, optional
        If True, per-job workers additionally dump per-utterance FST visualizations
    """
    os.makedirs(os.path.join(directory, 'log'), exist_ok=True)
    jobs = [(directory, lang_directory, split_directory, x, debug)
            for x in range(num_jobs)]
    if config.use_mp:
        run_mp(compile_train_graphs_func, jobs)
    else:
        run_non_mp(compile_train_graphs_func, jobs)


def mono_align_equal_func(mono_directory, job_name, feat_path):
    """
    Per-job worker: create equally spaced alignments for monophone initialization
    and accumulate GMM stats from them.

    Parameters
    ----------
    mono_directory : str
        Monophone training directory containing ``0.mdl`` and per-job graph FSTs
    job_name : int
        Job index used to locate the per-job fsts/feature/alignment files
    feat_path : str
        Path to the per-job feature scp file
    """
    fst_path = os.path.join(mono_directory, 'fsts.{}'.format(job_name))
    mdl_path = os.path.join(mono_directory, '0.mdl')
    log_path = os.path.join(mono_directory, 'log', 'align.0.{}.log'.format(job_name))
    ali_path = os.path.join(mono_directory, 'ali.{}'.format(job_name))
    acc_path = os.path.join(mono_directory, '0.{}.acc'.format(job_name))
    with open(log_path, 'w', encoding='utf8') as logf:
        align_proc = subprocess.Popen([thirdparty_binary('align-equal-compiled'), "ark:" + fst_path,
                                       'scp:' + feat_path, 'ark:' + ali_path],
                                      stderr=logf)
        align_proc.communicate()
        # The stats accumulator reads the alignments back from ali_path on disk.
        # (The original passed stdin=align_proc.stdout, but align_proc was created
        # without stdout=PIPE, so that attribute was always None — removed as
        # dead, misleading code; behavior is unchanged.)
        stats_proc = subprocess.Popen([thirdparty_binary('gmm-acc-stats-ali'), '--binary=true',
                                       mdl_path, 'scp:' + feat_path, 'ark:' + ali_path, acc_path],
                                      stderr=logf)
        stats_proc.communicate()
- - Parameters - ---------- - mono_directory : str - Directory of monophone training - split_directory : str - Directory of training data split into the number of jobs - num_jobs : int - The number of processes to use - """ - jobs = [(mono_directory, x, - os.path.join(split_directory, config.feature_file_base_name + '.{}.scp'.format(x))) - for x in range(num_jobs)] - - if config.use_mp: - run_mp(mono_align_equal_func, jobs) - else: - run_non_mp(mono_align_equal_func, jobs) - - -def align_func(directory, iteration, job_name, mdl, config, feat_path, output_directory): - fst_path = os.path.join(directory, 'fsts.{}'.format(job_name)) - log_path = os.path.join(output_directory, 'log', 'align.{}.{}.log'.format(iteration, job_name)) - ali_path = os.path.join(output_directory, 'ali.{}'.format(job_name)) - score_path = os.path.join(output_directory, 'ali.{}.scores'.format(job_name)) - with open(log_path, 'w', encoding='utf8') as logf: - align_proc = subprocess.Popen([thirdparty_binary('gmm-align-compiled'), - '--transition-scale={}'.format(config.transition_scale), - '--acoustic-scale={}'.format(config.acoustic_scale), - '--self-loop-scale={}'.format(config.self_loop_scale), - '--beam={}'.format(config.beam), - '--retry-beam={}'.format(config.retry_beam), - '--careful=false', - mdl, - "ark:" + fst_path, "scp:" + feat_path, "ark,t:" + ali_path, - "ark,t:" + score_path], - stderr=logf) - align_proc.communicate() - - -def align(iteration, directory, split_directory, optional_silence, num_jobs, config, output_directory=None): - """ - Multiprocessing function that aligns based on the current model - - See http://kaldi-asr.org/doc/gmm-align-compiled_8cc.html and - http://kaldi-asr.org/doc/gmm-boost-silence_8cc.html for more details - on the Kaldi binary this function calls. - - Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/align_si.sh - for the bash script this function was based on. 
- - Parameters - ---------- - iteration : int or str - Iteration to align - directory : str - Directory of training (monophone, triphone, speaker-adapted triphone - training directories) - split_directory : str - Directory of training data split into the number of jobs - optional_silence : str - Colon-separated list of silence phones to boost - num_jobs : int - The number of processes to use in calculation - config : :class:`~aligner.config.MonophoneConfig`, :class:`~aligner.config.TriphoneConfig` or :class:`~aligner.config.TriphoneFmllrConfig` - Configuration object for training - """ - if output_directory is None: - output_directory = directory - mdl_path = os.path.join(directory, '{}.mdl'.format(iteration)) - mdl = "{} --boost={} {} {} - |".format(thirdparty_binary('gmm-boost-silence'), - config.boost_silence, optional_silence, make_path_safe(mdl_path)) - feat_name = config.feature_file_base_name - feat_name += '.{}.scp' - jobs = [(directory, iteration, x, mdl, config, os.path.join(split_directory, feat_name.format(x)), output_directory) - for x in range(num_jobs)] - - if config.use_mp: - run_mp(align_func, jobs) - else: - run_non_mp(align_func, jobs) - - error_logs = [] - for i in range(num_jobs): - log_path = os.path.join(output_directory, 'log', 'align.{}.{}.log'.format(iteration, i)) - with open(log_path, 'r', encoding='utf8') as f: - for line in f: - if line.strip().startswith('ERROR'): - error_logs.append(log_path) - break - if error_logs: - message = 'There were {} job(s) with errors. 
For more information, please see the following logs:\n\n{}' - raise (AlignmentError(message.format(len(error_logs), '\n'.join(error_logs)))) - - -def decode_func(directory, job_name, mdl, config, feat_path, output_directory, num_threads=None): - log_path = os.path.join(output_directory, 'log', 'decode.{}.log'.format(job_name)) - lat_path = os.path.join(output_directory, 'lat.{}'.format(job_name)) - if os.path.exists(lat_path): - return - word_symbol_path = os.path.join(directory, 'words.txt') - hclg_path = os.path.join(directory, 'HCLG.fst') - if config.fmllr and config.first_beam is not None: - beam = config.first_beam - else: - beam = config.beam - if config.fmllr and config.first_max_active is not None: - max_active = config.first_max_active - else: - max_active = config.max_active - with open(log_path, 'w', encoding='utf8') as logf: - if num_threads is None: - decode_proc = subprocess.Popen([thirdparty_binary('gmm-latgen-faster'), - '--max-active={}'.format(max_active), - '--beam={}'.format(beam), - '--lattice-beam={}'.format(config.lattice_beam), - '--allow-partial=true', - '--word-symbol-table={}'.format(word_symbol_path), - '--acoustic-scale={}'.format(config.acoustic_scale), - mdl, hclg_path, "scp:" + feat_path, - "ark:" + lat_path], - stderr=logf) - else: - decode_proc = subprocess.Popen([thirdparty_binary('gmm-latgen-faster-parallel'), - '--max-active={}'.format(max_active), - '--beam={}'.format(beam), - '--lattice-beam={}'.format(config.lattice_beam), - '--allow-partial=true', - '--word-symbol-table={}'.format(word_symbol_path), - '--acoustic-scale={}'.format(config.acoustic_scale), - '--num-threads={}'.format(num_threads), - mdl, hclg_path, "scp:" + feat_path, - "ark:" + lat_path], - stderr=logf) - decode_proc.communicate() - - -def score_func(directory, job_name, config, output_directory, language_model_weight=None, word_insertion_penalty=None): - lat_path = os.path.join(directory, 'lat.{}'.format(job_name)) - words_path = os.path.join(directory, 
def score_func(directory, job_name, config, output_directory, language_model_weight=None, word_insertion_penalty=None):
    """
    Per-job worker: rescore lattices with a language-model weight and word
    insertion penalty, then extract the best path transcription.

    Pipeline: lattice-scale -> lattice-add-penalty -> lattice-best-path,
    writing the transcript to ``tra.<job>``.

    Parameters
    ----------
    directory : str
        Directory containing ``lat.<job>`` and ``words.txt``
    job_name : int
        Job index
    config : object
        Supplies default ``language_model_weight`` / ``word_insertion_penalty``
    output_directory : str
        Where ``tra.<job>`` and logs are written
    language_model_weight : int, optional
        Overrides ``config.language_model_weight`` when given
    word_insertion_penalty : float, optional
        Overrides ``config.word_insertion_penalty`` when given
    """
    lat_path = os.path.join(directory, 'lat.{}'.format(job_name))
    words_path = os.path.join(directory, 'words.txt')
    tra_path = os.path.join(output_directory, 'tra.{}'.format(job_name))
    log_path = os.path.join(output_directory, 'log', 'score.{}.log'.format(job_name))
    if language_model_weight is None:
        language_model_weight = config.language_model_weight
    if word_insertion_penalty is None:
        word_insertion_penalty = config.word_insertion_penalty
    with open(log_path, 'w', encoding='utf8') as logf:
        scale_proc = subprocess.Popen([thirdparty_binary('lattice-scale'),
                                       '--inv-acoustic-scale={}'.format(language_model_weight),
                                       'ark:' + lat_path, 'ark:-'
                                       ], stdout=subprocess.PIPE, stderr=logf)
        penalty_proc = subprocess.Popen([thirdparty_binary('lattice-add-penalty'),
                                         '--word-ins-penalty={}'.format(word_insertion_penalty),
                                         'ark:-', 'ark:-'],
                                        stdin=scale_proc.stdout, stdout=subprocess.PIPE, stderr=logf)
        best_path_proc = subprocess.Popen([thirdparty_binary('lattice-best-path'),
                                           '--word-symbol-table={}'.format(words_path),
                                           'ark:-', 'ark,t:' + tra_path], stdin=penalty_proc.stdout, stderr=logf)
        best_path_proc.communicate()


def transcribe(transcriber):
    """
    Decode and score the corpus held by ``transcriber``.

    Runs per-job decoding (multiprocessed when ``config.use_mp`` and more than
    one job), then scores. In evaluation mode, grid-searches language-model
    weights and word insertion penalties and stores the best pair back on
    the transcribe config.
    """
    directory = transcriber.transcribe_directory
    output_directory = transcriber.transcribe_directory
    config = transcriber.transcribe_config
    mdl_path = os.path.join(directory, 'final.mdl')
    corpus = transcriber.corpus
    num_jobs = corpus.num_jobs
    feat_name = config.feature_file_base_name
    feat_name += '.{}.scp'

    if config.use_mp and num_jobs > 1:
        jobs = [(directory, x, mdl_path, config, os.path.join(corpus.split_directory(), feat_name.format(x)),
                 output_directory)
                for x in range(num_jobs)]
    else:
        # Serial decoding gets the full thread budget per job.
        jobs = [(directory, x, mdl_path, config, os.path.join(corpus.split_directory(), feat_name.format(x)),
                 output_directory, corpus.original_num_jobs)
                for x in range(num_jobs)]

    if config.use_mp and num_jobs > 1:
        run_mp(decode_func, jobs)
    else:
        run_non_mp(decode_func, jobs)

    if transcriber.evaluation_mode:
        best_wer = 10000
        best = None
        for lmwt in range(transcriber.min_language_model_weight, transcriber.max_language_model_weight):
            for wip in transcriber.word_insertion_penalties:
                out_dir = os.path.join(output_directory, 'eval_{}_{}'.format(lmwt, wip))
                log_dir = os.path.join(out_dir, 'log')
                os.makedirs(log_dir, exist_ok=True)
                jobs = [(directory, x, config, out_dir, lmwt, wip)
                        for x in range(num_jobs)]
                if config.use_mp:
                    run_mp(score_func, jobs)
                else:
                    run_non_mp(score_func, jobs)
                ser, wer = transcriber.evaluate(out_dir, out_dir)
                if wer < best_wer:
                    # BUG FIX: best_wer was never updated, so ``best`` ended up
                    # being the *last* setting with wer < 10000 rather than the
                    # setting with the lowest WER.
                    best_wer = wer
                    best = (lmwt, wip)
        transcriber.transcribe_config.language_model_weight = best[0]
        transcriber.transcribe_config.word_insertion_penalty = best[1]
    else:
        jobs = [(directory, x, config, output_directory)
                for x in range(num_jobs)]
        if config.use_mp:
            run_mp(score_func, jobs)
        else:
            run_non_mp(score_func, jobs)
def initial_fmllr_func(directory, split_directory, sil_phones, job_name, mdl, config, feat_path, output_directory,
                       num_threads=None):
    # Per-job worker: estimate a first-pass fMLLR transform per speaker from the
    # first-pass decoding lattices, then write fMLLR-transformed features.
    # Pipeline: lattice-to-post -> weight-silence-post -> gmm-post-to-gpost ->
    # gmm-est-fmllr-gpost, followed by transform-feats to materialize the
    # transformed features as a new ark/scp pair.
    # num_threads is accepted for a uniform worker signature but unused here.
    feat_scp = config.feature_config.feature_id
    if '_fmllr' not in feat_scp:
        feat_scp += '_fmllr'
    feat_scp += '.{}.scp'.format(job_name)
    feat_ark = feat_scp.replace('.scp', '.ark')
    # base_scp points at the untransformed features the transform is applied to.
    base_scp = os.path.join(split_directory, feat_scp.replace('_fmllr', ''))

    log_path = os.path.join(output_directory, 'log', 'initial_fmllr.{}.log'.format(job_name))
    pre_trans_path = os.path.join(output_directory, 'pre_trans.{}'.format(job_name))
    lat_path = os.path.join(directory, 'lat.{}'.format(job_name))
    spk2utt_path = os.path.join(split_directory, 'spk2utt.{}'.format(job_name))
    utt2spk_path = os.path.join(split_directory, 'utt2spk.{}'.format(job_name))
    feat_fmllr_scp_path = os.path.join(split_directory, feat_scp)
    feat_fmllr_ark_path = os.path.join(split_directory,
                                       feat_ark)
    with open(log_path, 'w', encoding='utf8') as logf:
        # Convert first-pass lattices into per-frame posteriors.
        latt_post_proc = subprocess.Popen([thirdparty_binary('lattice-to-post'),
                                           '--acoustic-scale={}'.format(config.acoustic_scale),
                                           'ark:' + lat_path, 'ark:-'], stdout=subprocess.PIPE,
                                          stderr=logf)
        # Down-weight silence frames so they don't dominate the transform estimate.
        weight_silence_proc = subprocess.Popen([thirdparty_binary('weight-silence-post'),
                                                str(config.silence_weight),
                                                sil_phones, mdl, 'ark:-', 'ark:-'],
                                               stdin=latt_post_proc.stdout, stdout=subprocess.PIPE,
                                               stderr=logf)
        gmm_gpost_proc = subprocess.Popen([thirdparty_binary('gmm-post-to-gpost'),
                                           mdl, 'scp:' + feat_path, 'ark:-', 'ark:-'],
                                          stdin=weight_silence_proc.stdout, stdout=subprocess.PIPE,
                                          stderr=logf)
        # Estimate one transform per speaker (spk2utt) into pre_trans.<job>.
        fmllr_proc = subprocess.Popen([thirdparty_binary('gmm-est-fmllr-gpost'),
                                       '--fmllr-update-type={}'.format(config.fmllr_update_type),
                                       '--spk2utt=ark:' + spk2utt_path, mdl, 'scp:' + feat_path,
                                       'ark,s,cs:-', 'ark:' + pre_trans_path],
                                      stdin=gmm_gpost_proc.stdout, stdout=subprocess.PIPE, stderr=logf)
        fmllr_proc.communicate()
        # NOTE(review): original carried a bare "# error" marker here — the
        # transform-feats return code is not checked; failures only show in the log.
        subprocess.call([thirdparty_binary('transform-feats'),
                         '--utt2spk=ark:' + utt2spk_path,
                         'ark:' + pre_trans_path, 'scp:' + base_scp,
                         'ark,scp:{},{}'.format(feat_fmllr_ark_path, feat_fmllr_scp_path)],
                        stderr=logf)


def lat_gen_fmllr_func(directory, split_directory, sil_phones, job_name, mdl, config, feat_path, output_directory,
                       num_threads=None):
    # Per-job worker: re-decode with the fMLLR-transformed features, producing
    # undeterminized temporary lattices (lat.tmp.<job>) for the final fMLLR pass.
    # sil_phones and feat_path are accepted for signature uniformity but unused here.
    feat_scp = config.feature_config.feature_id
    if '_fmllr' not in feat_scp:
        feat_scp += '_fmllr'
    feat_scp += '.{}.scp'.format(job_name)
    log_path = os.path.join(output_directory, 'log', 'lat_gen.{}.log'.format(job_name))
    word_symbol_path = os.path.join(directory, 'words.txt')
    hclg_path = os.path.join(directory, 'HCLG.fst')
    tmp_lat_path = os.path.join(output_directory, 'lat.tmp.{}'.format(job_name))
    feat_fmllr_scp_path = os.path.join(split_directory, feat_scp)
    with open(log_path, 'w', encoding='utf8') as logf:
        # Single-threaded vs multi-threaded decoder chosen by num_threads;
        # --determinize-lattice=false keeps raw lattices for later rescoring.
        if num_threads is None:
            lat_gen_proc = subprocess.Popen([thirdparty_binary('gmm-latgen-faster'),
                                             '--max-active={}'.format(config.max_active),
                                             '--beam={}'.format(config.beam),
                                             '--lattice-beam={}'.format(config.lattice_beam),
                                             '--acoustic-scale={}'.format(config.acoustic_scale),
                                             '--determinize-lattice=false',
                                             '--allow-partial=true',
                                             '--word-symbol-table={}'.format(word_symbol_path),
                                             mdl, hclg_path, 'scp:' + feat_fmllr_scp_path, 'ark:' + tmp_lat_path
                                             ], stderr=logf)
        else:
            lat_gen_proc = subprocess.Popen([thirdparty_binary('gmm-latgen-faster-parallel'),
                                             '--max-active={}'.format(config.max_active),
                                             '--beam={}'.format(config.beam),
                                             '--lattice-beam={}'.format(config.lattice_beam),
                                             '--acoustic-scale={}'.format(config.acoustic_scale),
                                             '--determinize-lattice=false',
                                             '--allow-partial=true',
                                             '--num-threads={}'.format(num_threads),
                                             '--word-symbol-table={}'.format(word_symbol_path),
                                             mdl, hclg_path, 'scp:' + feat_fmllr_scp_path, 'ark:' + tmp_lat_path
                                             ], stderr=logf)
        lat_gen_proc.communicate()
determinize_proc = subprocess.Popen([thirdparty_binary('lattice-determinize-pruned'), - '--acoustic-scale={}'.format(config.acoustic_scale), - '--beam=4.0', 'ark:' + tmp_lat_path, 'ark:-'], - stderr=logf, stdout=subprocess.PIPE) - else: - determinize_proc = subprocess.Popen([thirdparty_binary('lattice-determinize-pruned-parallel'), - '--acoustic-scale={}'.format(config.acoustic_scale), - '--num-threads={}'.format(num_threads), - '--beam=4.0', 'ark:' + tmp_lat_path, 'ark:-'], - stderr=logf, stdout=subprocess.PIPE) - latt_post_proc = subprocess.Popen([thirdparty_binary('lattice-to-post'), - '--acoustic-scale={}'.format(config.acoustic_scale), - 'ark:' + lat_path, 'ark:-'], - stdin=determinize_proc.stdout, stdout=subprocess.PIPE, stderr=logf) - weight_silence_proc = subprocess.Popen([thirdparty_binary('weight-silence-post'), - str(config.silence_weight), - sil_phones, mdl, 'ark:-', 'ark:-'], - stdin=latt_post_proc.stdout, stdout=subprocess.PIPE, - stderr=logf) - fmllr_proc = subprocess.Popen([thirdparty_binary('gmm-est-fmllr'), - '--fmllr-update-type={}'.format(config.fmllr_update_type), - '--spk2utt=ark:' + spk2utt_path, mdl, 'scp:' + feat_fmllr_scp_path, - 'ark,s,cs:-', 'ark:' + trans_tmp_path], - stdin=weight_silence_proc.stdout, stdout=subprocess.PIPE, stderr=logf) - fmllr_proc.communicate() - - compose_proc = subprocess.Popen([thirdparty_binary('compose-transforms'), - '--b-is-affine=true', 'ark:' + trans_tmp_path, - 'ark:' + pre_trans_path, 'ark:' + trans_path], - stderr=logf) - compose_proc.communicate() - - subprocess.call([thirdparty_binary('transform-feats'), - '--utt2spk=ark:' + utt2spk_path, - 'ark:' + trans_path, 'scp:' + base_scp, - 'ark,scp:{},{}'.format(feat_fmllr_ark_path, feat_fmllr_scp_path)], - stderr=logf) - - -def fmllr_rescore_func(directory, split_directory, sil_phones, job_name, mdl, config, feat_path, output_directory, - num_threads=None): - log_path = os.path.join(output_directory, 'log', 'fmllr_rescore.{}.log'.format(job_name)) - 
tmp_lat_path = os.path.join(output_directory, 'lat.tmp.{}'.format(job_name)) - final_lat_path = os.path.join(output_directory, 'lat.{}'.format(job_name)) - feat_fmllr_scp_path = os.path.join(split_directory, - config.feature_config.feature_id + '.{}.scp'.format(job_name)) - with open(log_path, 'w', encoding='utf8') as logf: - rescore_proc = subprocess.Popen([thirdparty_binary('gmm-rescore-lattice'), - mdl, 'ark:' + tmp_lat_path, - 'scp:' + feat_fmllr_scp_path, 'ark:-'], - stdout=subprocess.PIPE, stderr=logf) - if num_threads is None: - determinize_proc = subprocess.Popen([thirdparty_binary('lattice-determinize-pruned'), - '--acoustic-scale={}'.format(config.acoustic_scale), - '--beam={}'.format(config.lattice_beam), - 'ark:-', 'ark:' + final_lat_path - ], stdin=rescore_proc.stdout, stderr=logf) - else: - determinize_proc = subprocess.Popen([thirdparty_binary('lattice-determinize-pruned-parallel'), - '--acoustic-scale={}'.format(config.acoustic_scale), - '--beam={}'.format(config.lattice_beam), - '--num-threads={}'.format(num_threads), - 'ark:-', 'ark:' + final_lat_path - ], stdin=rescore_proc.stdout, stderr=logf) - determinize_proc.communicate() - - -def transcribe_fmllr(transcriber): - directory = transcriber.transcribe_directory - output_directory = transcriber.transcribe_directory - config = transcriber.transcribe_config - corpus = transcriber.corpus - num_jobs = corpus.num_jobs - split_directory = corpus.split_directory() - sil_phones = transcriber.dictionary.optional_silence_csl - - fmllr_directory = os.path.join(output_directory, 'fmllr') - log_dir = os.path.join(fmllr_directory, 'log') - os.makedirs(log_dir, exist_ok=True) - mdl_path = os.path.join(directory, 'final.mdl') - feat_name = config.feature_file_base_name - feat_name += '.{}.scp' - if num_jobs > 1: - jobs = [(directory, split_directory, sil_phones, x, mdl_path, config, - os.path.join(split_directory, feat_name.format(x)), fmllr_directory) - for x in range(num_jobs)] - else: - jobs = [(directory, 
split_directory, sil_phones, x, mdl_path, config, - os.path.join(split_directory, feat_name.format(x)), fmllr_directory, corpus.original_num_jobs) - for x in range(num_jobs)] - - run_non_mp(initial_fmllr_func, jobs) - - if config.use_mp and num_jobs > 1: - run_mp(lat_gen_fmllr_func, jobs) - else: - run_non_mp(lat_gen_fmllr_func, jobs) - - run_non_mp(final_fmllr_est_func, jobs) - - if config.use_mp: - run_mp(fmllr_rescore_func, jobs) - else: - run_non_mp(fmllr_rescore_func, jobs) - - if transcriber.evaluation_mode: - best_wer = 10000 - best = None - for lmwt in range(transcriber.min_language_model_weight, transcriber.max_language_model_weight): - for wip in transcriber.word_insertion_penalties: - out_dir = os.path.join(fmllr_directory, 'eval_{}_{}'.format(lmwt, wip)) - log_dir = os.path.join(out_dir, 'log') - os.makedirs(log_dir, exist_ok=True) - jobs = [(directory, x, config, out_dir, lmwt, wip) - for x in range(num_jobs)] - if config.use_mp: - run_mp(score_func, jobs) - else: - run_non_mp(score_func, jobs) - ser, wer = transcriber.evaluate(out_dir, out_dir) - if wer < best_wer: - best = (lmwt, wip) - transcriber.transcribe_config.language_model_weight = best[0] - transcriber.transcribe_config.word_insertion_penalty = best[1] - out_dir = os.path.join(fmllr_directory, 'eval_{}_{}'.format(best[0], best[1])) - for j in range(num_jobs): - tra_path = os.path.join(out_dir, 'tra.{}'.format(j)) - saved_tra_path = os.path.join(fmllr_directory, 'tra.{}'.format(j)) - shutil.copyfile(tra_path, saved_tra_path) - else: - jobs = [(directory, x, config, fmllr_directory) - for x in range(num_jobs)] - if config.use_mp: - run_mp(score_func, jobs) - else: - run_non_mp(score_func, jobs) - - -def compile_information_func(log_directory, corpus, job_num): - align_path = os.path.join(log_directory, 'align.final.{}.log'.format(job_num)) - unaligned = {} - output_path = os.path.join(log_directory, 'unaligned.{}.log'.format(job_num)) - with open(align_path, 'r', encoding='utf8') as f: - for 
line in f: - m = re.search(r'Did not successfully decode file (.*?),', line) - if m is not None: - utt = m.groups()[0] - unaligned[utt] = 'Could not decode (beam too narrow)' - features_path = os.path.join(corpus.split_directory(), 'log', 'make_mfcc.{}.log'.format(job_num)) - with open(features_path, 'r', encoding='utf8') as f: - for line in f: - m = re.search(r'Segment (.*?) too short', line) - if m is not None: - utt = m.groups()[0] - unaligned[utt] = 'Too short to get features' - with open(output_path, 'w', encoding='utf8') as f: - for k, v in unaligned.items(): - f.write('{} {}\n'.format(k, v)) - - -def compile_information(model_directory, corpus, num_jobs, config): - log_dir = os.path.join(model_directory, 'log') - - jobs = [(log_dir, corpus, x) - for x in range(num_jobs)] - - run_non_mp(compile_information_func, jobs) - - unaligned = {} - for j in jobs: - path = os.path.join(log_dir, 'unaligned.{}.log'.format(j[-1])) - with open(path, 'r', encoding='utf8') as f: - for line in f: - line = line.strip() - utt, reason = line.split(' ', maxsplit=1) - unaligned[utt] = reason - return unaligned - - -def compute_alignment_improvement_func(iteration, config, model_directory, job_name): - try: - text_int_path = os.path.join(config.data_directory, 'text.{}.int'.format(job_name)) - log_path = os.path.join(model_directory, 'log', 'get_ctm.{}.{}.log'.format(iteration, job_name)) - ali_path = os.path.join(model_directory, 'ali.{}'.format(job_name)) - model_path = os.path.join(model_directory, '{}.mdl'.format(iteration)) - phone_ctm_path = os.path.join(model_directory, 'phone.{}.{}.ctm'.format(iteration, job_name)) - if os.path.exists(phone_ctm_path): - return - - frame_shift = config.feature_config.frame_shift / 1000 - with open(log_path, 'w', encoding='utf8') as logf: - lin_proc = subprocess.Popen([thirdparty_binary('linear-to-nbest'), "ark:" + ali_path, - "ark:" + text_int_path, - '', '', 'ark:-'], - stdout=subprocess.PIPE, stderr=logf) - align_proc = 
subprocess.Popen([thirdparty_binary('lattice-align-words'), - os.path.join(config.dictionary.phones_dir, 'word_boundary.int'), model_path, - 'ark:-', 'ark:-'], - stdin=lin_proc.stdout, stderr=logf, - stdout=subprocess.PIPE) - phone_proc = subprocess.Popen([thirdparty_binary('lattice-to-phone-lattice'), model_path, - 'ark:-', "ark:-"], - stdin=align_proc.stdout, - stdout=subprocess.PIPE, - stderr=logf) - nbest_proc = subprocess.Popen([thirdparty_binary('nbest-to-ctm'), - '--frame-shift={}'.format(frame_shift), - "ark:-", phone_ctm_path], - stdin=phone_proc.stdout, - stderr=logf) - nbest_proc.communicate() - mapping = config.dictionary.reversed_phone_mapping - actual_lines = [] - with open(phone_ctm_path, 'r', encoding='utf8') as f: - for line in f: - line = line.strip() - if line == '': - continue - line = line.split(' ') - utt = line[0] - begin = Decimal(line[2]) - duration = Decimal(line[3]) - end = begin + duration - label = line[4] - try: - label = mapping[int(label)] - except KeyError: - pass - for p in config.dictionary.positions: - if label.endswith(p): - label = label[:-1 * len(p)] - actual_lines.append([utt, begin, end, label]) - with open(phone_ctm_path, 'w', encoding='utf8') as f: - for line in actual_lines: - f.write('{}\n'.format(' '.join(map(str, line)))) - except Exception as e: - raise (Exception(str(e))) - - -def parse_iteration_alignments(directory, iteration, num_jobs): - data = {} - for j in range(num_jobs): - phone_ctm_path = os.path.join(directory, 'phone.{}.{}.ctm'.format(iteration, j)) - with open(phone_ctm_path, 'r', encoding='utf8') as f: - for line in f: - line = line.strip() - if line == '': - continue - line = line.split(' ') - utt = line[0] - begin = Decimal(line[1]) - end = Decimal(line[2]) - label = line[3] - if utt not in data: - data[utt] = [] - data[utt].append([begin, end, label]) - return data - - -def compare_alignments(alignments_one, alignments_two, frame_shift): - utterances_aligned_diff = len(alignments_two) - 
len(alignments_one) - utts_one = set(alignments_one.keys()) - utts_two = set(alignments_two.keys()) - common_utts = utts_one.intersection(utts_two) - differences = [] - for u in common_utts: - end = alignments_one[u][-1][1] - t = Decimal('0.0') - one_alignment = alignments_one[u] - two_alignment = alignments_two[u] - difference = 0 - while t < end: - one_label = None - two_label = None - for b, e, l in one_alignment: - if t < b: - continue - if t >= e: - break - one_label = l - for b, e, l in two_alignment: - if t < b: - continue - if t >= e: - break - two_label = l - if one_label != two_label: - difference += frame_shift - t += frame_shift - difference /= end - differences.append(difference) - if differences: - mean_difference = statistics.mean(differences) - else: - mean_difference = 'N/A' - return utterances_aligned_diff, mean_difference - - -def compute_alignment_improvement(iteration, config, model_directory, num_jobs): - jobs = [(iteration, config, model_directory, x) for x in range(num_jobs)] - if config.use_mp: - run_mp(compute_alignment_improvement_func, jobs) - else: - run_non_mp(compute_alignment_improvement_func, jobs) - - alignment_diff_path = os.path.join(model_directory, 'train_change.csv') - if iteration == 0 or iteration not in config.realignment_iterations: - return - ind = config.realignment_iterations.index(iteration) - if ind != 0: - previous_iteration = config.realignment_iterations[ind - 1] - else: - previous_iteration = 0 - try: - previous_alignments = parse_iteration_alignments(model_directory, previous_iteration, num_jobs) - except FileNotFoundError: - return - current_alignments = parse_iteration_alignments(model_directory, iteration, num_jobs) - utterance_aligned_diff, mean_difference = compare_alignments(previous_alignments, current_alignments, - config.feature_config.frame_shift) - if not os.path.exists(alignment_diff_path): - with open(alignment_diff_path, 'w', encoding='utf8') as f: - 
def ali_to_textgrid_func(align_config, model_directory, dictionary, corpus, job_name):
    # Per-job worker: convert alignments into word- and phone-level CTM files
    # for TextGrid export. Pipeline: linear-to-nbest -> lattice-align-words,
    # then nbest-to-ctm for words and lattice-to-phone-lattice + nbest-to-ctm
    # for phones.
    text_int_path = os.path.join(corpus.split_directory(), 'text.{}.int'.format(job_name))
    log_path = os.path.join(model_directory, 'log', 'get_ctm_align.{}.log'.format(job_name))
    ali_path = os.path.join(model_directory, 'ali.{}'.format(job_name))
    model_path = os.path.join(model_directory, 'final.mdl')
    aligned_path = os.path.join(model_directory, 'aligned.{}'.format(job_name))
    nbest_path = os.path.join(model_directory, 'nbest.{}'.format(job_name))
    word_ctm_path = os.path.join(model_directory, 'word_ctm.{}'.format(job_name))
    phone_ctm_path = os.path.join(model_directory, 'phone_ctm.{}'.format(job_name))

    # CTM times in seconds (frame shift converted from ms).
    frame_shift = align_config.feature_config.frame_shift / 1000
    with open(log_path, 'w', encoding='utf8') as logf:
        # First pass writes a text-form nbest file (nbest.<job>); its stdout
        # pipe is unused. NOTE(review): this looks like a debugging artifact —
        # only the second run feeds the alignment pipeline.
        lin_proc = subprocess.Popen([thirdparty_binary('linear-to-nbest'), "ark:" + ali_path,
                                     "ark:" + text_int_path,
                                     '', '', 'ark,t:' + nbest_path],
                                    stdout=subprocess.PIPE, stderr=logf)

        lin_proc.communicate()
        lin_proc = subprocess.Popen([thirdparty_binary('linear-to-nbest'), "ark:" + ali_path,
                                     "ark:" + text_int_path,
                                     '', '', 'ark:-'],
                                    stdout=subprocess.PIPE, stderr=logf)
        # Attach word boundary information so word-level times are correct.
        align_proc = subprocess.Popen([thirdparty_binary('lattice-align-words'),
                                       os.path.join(dictionary.phones_dir, 'word_boundary.int'), model_path,
                                       'ark:-', 'ark,t:' + aligned_path],
                                      stdin=lin_proc.stdout, stderr=logf)
        align_proc.communicate()

        # Word-level CTM straight from the word-aligned lattices.
        subprocess.call([thirdparty_binary('nbest-to-ctm'),
                         '--frame-shift={}'.format(frame_shift),
                         'ark:' + aligned_path,
                         word_ctm_path],
                        stderr=logf)
        # Phone-level CTM via lattice-to-phone-lattice.
        phone_proc = subprocess.Popen([thirdparty_binary('lattice-to-phone-lattice'), model_path,
                                       'ark:' + aligned_path, "ark:-"],
                                      stdout=subprocess.PIPE,
                                      stderr=logf)
        nbest_proc = subprocess.Popen([thirdparty_binary('nbest-to-ctm'),
                                       '--frame-shift={}'.format(frame_shift),
                                       "ark:-", phone_ctm_path],
                                      stdin=phone_proc.stdout,
                                      stderr=logf)
        nbest_proc.communicate()
- - Parameters - ---------- - output_directory : str - Directory to write TextGrid files to - model_directory : str - Directory of training (monophone, triphone, speaker-adapted triphone - training directories) - dictionary : :class:`~aligner.dictionary.Dictionary` - Dictionary object that has information about pronunciations - corpus : :class:`~aligner.corpus.Corpus` - Corpus object that has information about the dataset - num_jobs : int - The number of processes to use in calculation - - Raises - ------ - CorpusError - If the files per speaker exceeds the number of files that are - allowed to be open on the computer (for Unix-based systems) - - """ - jobs = [(align_config, model_directory, dictionary, corpus, x) - for x in range(num_jobs)] - if align_config.use_mp: - run_mp(ali_to_textgrid_func, jobs) - else: - run_non_mp(ali_to_textgrid_func, jobs) - - word_ctm = {} - phone_ctm = {} - for i in range(num_jobs): - word_ctm_path = os.path.join(model_directory, 'word_ctm.{}'.format(i)) - phone_ctm_path = os.path.join(model_directory, 'phone_ctm.{}'.format(i)) - if not os.path.exists(word_ctm_path): - continue - parsed = parse_ctm(word_ctm_path, corpus, dictionary, mode='word') - for k, v in parsed.items(): - if k not in word_ctm: - word_ctm[k] = v - else: - word_ctm[k].update(v) - parsed = parse_ctm(phone_ctm_path, corpus, dictionary, mode='phone') - for k, v in parsed.items(): - if k not in phone_ctm: - phone_ctm[k] = v - else: - phone_ctm[k].update(v) - ctm_to_textgrid(word_ctm, phone_ctm, output_directory, corpus, dictionary) - - -def generate_pronunciations_func(align_config, model_directory, dictionary, corpus, job_name): - text_int_path = os.path.join(corpus.split_directory(), 'text.{}.int'.format(job_name)) - log_path = os.path.join(model_directory, 'log', 'pronunciation.{}.log'.format(job_name)) - ali_path = os.path.join(model_directory, 'ali.{}'.format(job_name)) - model_path = os.path.join(model_directory, 'final.mdl') - aligned_path = 
os.path.join(model_directory, 'aligned.{}'.format(job_name)) - nbest_path = os.path.join(model_directory, 'nbest.{}'.format(job_name)) - pron_path = os.path.join(model_directory, 'prons.{}'.format(job_name)) - with open(log_path, 'w', encoding='utf8') as logf: - lin_proc = subprocess.Popen([thirdparty_binary('linear-to-nbest'), "ark:" + ali_path, - "ark:" + text_int_path, - '', '', 'ark,t:' + nbest_path], - stdout=subprocess.PIPE, stderr=logf) - - lin_proc.communicate() - lin_proc = subprocess.Popen([thirdparty_binary('linear-to-nbest'), "ark:" + ali_path, - "ark:" + text_int_path, - '', '', 'ark:-'], - stdout=subprocess.PIPE, stderr=logf) - align_proc = subprocess.Popen([thirdparty_binary('lattice-align-words'), - os.path.join(dictionary.phones_dir, 'word_boundary.int'), model_path, - 'ark:-', 'ark,t:' + aligned_path], - stdin=lin_proc.stdout, stderr=logf) - align_proc.communicate() - - subprocess.call([thirdparty_binary('nbest-to-prons'), - model_path, - 'ark:' + aligned_path, - pron_path], - stderr=logf) - - -def generate_pronunciations(align_config, model_directory, dictionary, corpus, num_jobs): - from collections import Counter, defaultdict - jobs = [(align_config, model_directory, dictionary, corpus, x) - for x in range(num_jobs)] - if align_config.use_mp: - run_mp(generate_pronunciations_func, jobs) - else: - run_non_mp(generate_pronunciations_func, jobs) - - word_lookup = dictionary.reversed_word_mapping - phone_lookup = dictionary.reversed_phone_mapping - pron_counts = defaultdict(Counter) - for j in range(num_jobs): - pron_path = os.path.join(model_directory, 'prons.{}'.format(j)) - with open(pron_path, 'r', encoding='utf8') as f: - utt_mapping = {} - last_utt = None - for line in f: - line = line.split() - utt = line[0] - if utt not in utt_mapping: - if last_utt is not None: - utt_mapping[last_utt].append('') - utt_mapping[utt] = [''] - last_utt = utt - - begin = line[1] - end = line[2] - word = word_lookup[int(line[3])] - if word == '': - 
utt_mapping[utt].append(word) - else: - pron = tuple(phone_lookup[int(x)].split('_')[0] for x in line[4:]) - pron_string = ' '.join(pron) - utt_mapping[utt].append(word + ' ' + pron_string) - pron_counts[word][pron] += 1 - print(word, pron) - return pron_counts, utt_mapping - - -def tree_stats_func(directory, ci_phones, mdl, feat_path, ali_path, job_name): - context_opts = [] - log_path = os.path.join(directory, 'log', 'acc_tree.{}.log'.format(job_name)) - - treeacc_path = os.path.join(directory, '{}.treeacc'.format(job_name)) - - with open(log_path, 'w', encoding='utf8') as logf: - subprocess.call([thirdparty_binary('acc-tree-stats')] + context_opts + - ['--ci-phones=' + ci_phones, mdl, "scp:" + feat_path, - "ark:" + ali_path, - treeacc_path], stderr=logf) - - -def tree_stats(directory, align_directory, split_directory, ci_phones, num_jobs, config): - """ - Multiprocessing function that computes stats for decision tree training - - See http://kaldi-asr.org/doc/acc-tree-stats_8cc.html for more details - on the Kaldi binary this runs. 
- - Parameters - ---------- - directory : str - Directory of training (triphone, speaker-adapted triphone - training directories) - align_directory : str - Directory of previous alignment - split_directory : str - Directory of training data split into the number of jobs - ci_phones : str - Colon-separated list of context-independent phones - num_jobs : int - The number of processes to use in calculation - """ - feat_name = config.feature_file_base_name - - if '_fmllr' in feat_name: - feat_name = feat_name.replace('_fmllr', '') - - feat_name += '.{}.scp' - mdl_path = os.path.join(align_directory, 'final.mdl') - - jobs = [(directory, ci_phones, mdl_path, - os.path.join(split_directory, feat_name.format(x)), - os.path.join(align_directory, 'ali.{}'.format(x)), x) - for x in range(num_jobs)] - - if config.use_mp: - run_mp(tree_stats_func, jobs) - else: - run_non_mp(tree_stats_func, jobs) - - tree_accs = [os.path.join(directory, '{}.treeacc'.format(x)) for x in range(num_jobs)] - log_path = os.path.join(directory, 'log', 'sum_tree_acc.log') - with open(log_path, 'w', encoding='utf8') as logf: - subprocess.call([thirdparty_binary('sum-tree-stats'), os.path.join(directory, 'treeacc')] + - tree_accs, stderr=logf) - # for f in tree_accs: - # os.remove(f) - - -def convert_alignments_func(directory, align_directory, job_name): - mdl_path = os.path.join(directory, '1.mdl') - tree_path = os.path.join(directory, 'tree') - ali_mdl_path = os.path.join(align_directory, 'final.mdl') - ali_path = os.path.join(align_directory, 'ali.{}'.format(job_name)) - new_ali_path = os.path.join(directory, 'ali.{}'.format(job_name)) - - log_path = os.path.join(directory, 'log', 'convert.{}.log'.format(job_name)) - with open(log_path, 'w', encoding='utf8') as logf: - subprocess.call([thirdparty_binary('convert-ali'), ali_mdl_path, - mdl_path, tree_path, "ark:" + ali_path, - "ark:" + new_ali_path], stderr=logf) - - -def convert_alignments(directory, align_directory, num_jobs, config): - """ - 
Multiprocessing function that converts alignments from previous training - - See http://kaldi-asr.org/doc/convert-ali_8cc.html for more details - on the Kaldi binary this runs. - - Parameters - ---------- - directory : str - Directory of training (triphone, speaker-adapted triphone - training directories) - align_directory : str - Directory of previous alignment - num_jobs : int - The number of processes to use in calculation - - """ - - jobs = [(directory, align_directory, x) - for x in range(num_jobs)] - if config.use_mp: - run_mp(convert_alignments_func, jobs) - else: - run_non_mp(convert_alignments_func, jobs) - - -def calc_fmllr_func(directory, split_directory, sil_phones, job_name, config, initial, - model_name='final'): - feat_scp = config.feature_config.feature_id + '.{}.scp'.format(job_name) - base_scp = feat_scp.replace('_fmllr', '') - if initial: - feat_scp = base_scp - feat_scp = os.path.join(split_directory, feat_scp) - base_scp = os.path.join(split_directory, base_scp) - - log_path = os.path.join(directory, 'log', 'fmllr.{}.{}.log'.format(model_name, job_name)) - ali_path = os.path.join(directory, 'ali.{}'.format(job_name)) - mdl_path = os.path.join(directory, '{}.mdl'.format(model_name)) - spk2utt_path = os.path.join(split_directory, 'spk2utt.{}'.format(job_name)) - if not initial: - tmp_trans_path = os.path.join(directory, 'trans.temp.{}'.format(job_name)) - trans_path = os.path.join(directory, 'trans.{}'.format(job_name)) - cmp_trans_path = os.path.join(directory, 'trans.cmp.{}'.format(job_name)) - else: - tmp_trans_path = os.path.join(directory, 'trans.{}'.format(job_name)) - post_path = os.path.join(directory, 'post.{}'.format(job_name)) - weight_path = os.path.join(directory, 'weight.{}'.format(job_name)) - with open(log_path, 'w', encoding='utf8') as logf: - subprocess.call([thirdparty_binary('ali-to-post'), - "ark:" + ali_path, 'ark:' + post_path], stderr=logf) - - subprocess.call([thirdparty_binary('weight-silence-post'), '0.0', - sil_phones, 
mdl_path, 'ark:' + post_path, - 'ark:' + weight_path], stderr=logf) - - subprocess.call([thirdparty_binary('gmm-est-fmllr'), - '--verbose=4', - '--fmllr-update-type={}'.format(config.fmllr_update_type), - '--spk2utt=ark:' + spk2utt_path, mdl_path, "scp:" + feat_scp, - 'ark,s,cs:' + weight_path, 'ark:' + tmp_trans_path], - stderr=logf) - - if not initial: - subprocess.call([thirdparty_binary('compose-transforms'), - '--b-is-affine=true', - 'ark:' + tmp_trans_path, 'ark:' + trans_path, - 'ark:' + cmp_trans_path], stderr=logf) - os.remove(tmp_trans_path) - os.remove(trans_path) - os.rename(cmp_trans_path, trans_path) - else: - trans_path = tmp_trans_path - utt2spk_path = os.path.join(config.corpus.split_directory(), 'utt2spk.{}'.format(job_name)) - feat_fmllr_scp_path = os.path.join(config.corpus.split_directory(), - config.feature_config.feature_id + '.{}.scp'.format(job_name)) - feat_fmllr_ark_path = os.path.join(config.corpus.split_directory(), - config.feature_config.feature_id + '.{}.ark'.format(job_name)) - subprocess.call([thirdparty_binary('transform-feats'), - '--utt2spk=ark:' + utt2spk_path, - 'ark:' + trans_path, 'scp:' + base_scp, - 'ark,scp:{},{}'.format(feat_fmllr_ark_path, feat_fmllr_scp_path)], - stderr=logf) - - -def calc_fmllr(directory, split_directory, sil_phones, num_jobs, config, - initial=False, iteration=None): - """ - Multiprocessing function that computes speaker adaptation (fMLLR) - - See: - - - http://kaldi-asr.org/doc/gmm-est-fmllr_8cc.html - - http://kaldi-asr.org/doc/ali-to-post_8cc.html - - http://kaldi-asr.org/doc/weight-silence-post_8cc.html - - http://kaldi-asr.org/doc/compose-transforms_8cc.html - - http://kaldi-asr.org/doc/transform-feats_8cc.html - - for more details - on the Kaldi binary this runs. - - Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/align_fmllr.sh - for the original bash script that this function was based on. 
- - Parameters - ---------- - directory : str - Directory of training (triphone, speaker-adapted triphone - training directories) - split_directory : str - Directory of training data split into the number of jobs - sil_phones : str - Colon-separated list of silence phones - num_jobs : int - The number of processes to use in calculation - config : :class:`~aligner.config.TriphoneFmllrConfig` - Configuration object for training - initial : bool, optional - Whether this is the first computation of speaker-adaptation, - defaults to False - iteration : int - Specifies the current iteration, defaults to None - - """ - if iteration is None: - if initial: - model_name = '1' - else: - model_name = 'final' - else: - model_name = iteration - jobs = [(directory, split_directory, sil_phones, x, config, initial, model_name) - for x in range(num_jobs)] - # if config.use_mp: - # run_mp(calc_fmllr_func, jobs) - # else: - run_non_mp(calc_fmllr_func, jobs) - - -def lda_acc_stats_func(directory, split_dir, align_directory, config, ci_phones, i): - log_path = os.path.join(directory, 'log', 'ali_to_post.{}.log'.format(i)) - with open(log_path, 'w', encoding='utf8') as logf: - spliced_feat_path = os.path.join(split_dir, config.feature_config.feature_id + '.{}.scp'.format(i)) - ali_to_post_proc = subprocess.Popen([thirdparty_binary('ali-to-post'), - 'ark:' + os.path.join(align_directory, 'ali.{}'.format(i)), - 'ark:-'], - stderr=logf, stdout=subprocess.PIPE) - weight_silence_post_proc = subprocess.Popen([thirdparty_binary('weight-silence-post'), - str(config.boost_silence), ci_phones, - os.path.join(align_directory, 'final.mdl'), - 'ark:-', 'ark:-'], - stdin=ali_to_post_proc.stdout, - stderr=logf, stdout=subprocess.PIPE) - acc_lda_post_proc = subprocess.Popen([thirdparty_binary('acc-lda'), - '--rand-prune=' + str(config.random_prune), - os.path.join(align_directory, 'final.mdl'), - 'scp:' + spliced_feat_path, - 'ark,s,cs:-', - os.path.join(directory, 'lda.{}.acc'.format(i))], - 
stdin=weight_silence_post_proc.stdout, - stderr=logf) - acc_lda_post_proc.communicate() - - -def lda_acc_stats(directory, split_directory, align_directory, config, ci_phones, num_jobs): - """ - Multiprocessing function that accumulates LDA statistics - - See: - - - http://kaldi-asr.org/doc/ali-to-post_8cc.html - - http://kaldi-asr.org/doc/weight-silence-post_8cc.html - - http://kaldi-asr.org/doc/acc-lda_8cc.html - - http://kaldi-asr.org/doc/est-lda_8cc.html - - for more details - on the Kaldi binary this runs. - - Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_lda_mllt.sh - for the original bash script that this function was based on. - - Parameters - ---------- - directory : str - Directory of LDA+MLLT training - split_directory : str - Directory of training data split into the number of jobs - align_directory : str - Directory of previous alignment - config : :class:`~aligner.config.LdaMlltConfig` - Configuration object for training - ci_phones : str - Colon-separated list of context-independent phones - num_jobs : int - The number of processes to use in calculation - - """ - jobs = [(directory, split_directory, align_directory, config, ci_phones, x) for x in range(num_jobs)] - if config.use_mp: - run_mp(lda_acc_stats_func, jobs) - else: - run_non_mp(lda_acc_stats_func, jobs) - - log_path = os.path.join(directory, 'log', 'lda_est.log') - acc_list = [] - for x in range(num_jobs): - acc_list.append(os.path.join(directory, 'lda.{}.acc'.format(x))) - with open(log_path, 'w', encoding='utf8') as logf: - est_lda_proc = subprocess.Popen([thirdparty_binary('est-lda'), - '--write-full-matrix=' + os.path.join(directory, 'full.mat'), - '--dim=' + str(config.lda_dimension), - os.path.join(directory, 'lda.mat')] + acc_list, - stderr=logf) - est_lda_proc.communicate() - shutil.copyfile(os.path.join(directory, 'lda.mat'), os.path.join(config.corpus.split_directory(), 'lda.mat')) - config.feature_config.generate_features(config.corpus, 
overwrite=True) - - -def calc_lda_mllt_func(directory, split_directory, sil_phones, job_name, config, - initial, - model_name='final'): - log_path = os.path.join(directory, 'log', 'lda_mllt.{}.{}.log'.format(model_name, job_name)) - ali_path = os.path.join(directory, 'ali.{}'.format(job_name)) - if not initial: - mdl_path = os.path.join(directory, '{}.mdl'.format(model_name)) - else: - mdl_path = os.path.join(directory, '1.mdl') - model_name = 1 - - feat_path = os.path.join(split_directory, config.feature_config.feature_id + '.{}.scp'.format(job_name)) - post_path = os.path.join(directory, 'post.{}'.format(job_name)) - weight_path = os.path.join(directory, 'weight.{}'.format(job_name)) - - # Estimating MLLT - with open(log_path, 'a', encoding='utf8') as logf: - subprocess.call([thirdparty_binary('ali-to-post'), - "ark:" + ali_path, 'ark:' + post_path], stderr=logf) - - subprocess.call([thirdparty_binary('weight-silence-post'), '0.0', - sil_phones, mdl_path, 'ark:' + post_path, - 'ark:' + weight_path], stderr=logf) - subprocess.call([thirdparty_binary('gmm-acc-mllt'), - '--rand-prune=' + str(config.random_prune), - mdl_path, - 'scp:' + feat_path, - 'ark:' + post_path, - os.path.join(directory, '{}.{}.macc'.format(model_name, job_name))], - stderr=logf) - - -def calc_lda_mllt(directory, split_directory, sil_phones, num_jobs, config, - initial=False, iteration=None): - """ - Multiprocessing function that calculates LDA+MLLT transformations - - See: - - - http://kaldi-asr.org/doc/ali-to-post_8cc.html - - http://kaldi-asr.org/doc/weight-silence-post_8cc.html - - http://kaldi-asr.org/doc/gmm-acc-mllt_8cc.html - - http://kaldi-asr.org/doc/est-mllt_8cc.html - - http://kaldi-asr.org/doc/gmm-transform-means_8cc.html - - http://kaldi-asr.org/doc/compose-transforms_8cc.html - - for more details - on the Kaldi binary this runs. 
- - Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_lda_mllt.sh - for the original bash script that this function was based on. - - Parameters - ---------- - directory : str - Directory of LDA+MLLT training - split_directory : str - Directory of training data split into the number of jobs - sil_phones : str - Colon-separated list of silence phones - num_jobs : int - The number of processes to use in calculation - config : :class:`~aligner.config.LdaMlltConfig` - Configuration object for training - initial : bool - Flag for first iteration - iteration : int - Current iteration - - """ - if iteration is None: - model_name = 'final' - else: - model_name = iteration - jobs = [ - (directory, split_directory, sil_phones, x, config, initial, model_name) - for x in range(num_jobs)] - if config.use_mp: - run_mp(calc_lda_mllt_func, jobs) - else: - run_non_mp(calc_lda_mllt_func, jobs) - - mdl_path = os.path.join(directory, '{}.mdl'.format(model_name)) - log_path = os.path.join(directory, 'log', 'transform_means.{}.log'.format(model_name)) - previous_mat_path = os.path.join(directory, 'lda.mat') - new_mat_path = os.path.join(directory, 'lda_new.mat') - composed_path = os.path.join(directory, 'lda_composed.mat') - with open(log_path, 'a', encoding='utf8') as logf: - macc_list = [] - for x in range(num_jobs): - macc_list.append(os.path.join(directory, '{}.{}.macc'.format(model_name, x))) - subprocess.call([thirdparty_binary('est-mllt'), - new_mat_path] - + macc_list, - stderr=logf) - subprocess.call([thirdparty_binary('gmm-transform-means'), - new_mat_path, - mdl_path, mdl_path], - stderr=logf) - - if os.path.exists(previous_mat_path): - subprocess.call([thirdparty_binary('compose-transforms'), - new_mat_path, - previous_mat_path, - composed_path], - stderr=logf) - os.remove(previous_mat_path) - os.rename(composed_path, previous_mat_path) - else: - os.rename(new_mat_path, previous_mat_path) - - shutil.copyfile(os.path.join(directory, 'lda.mat'), 
os.path.join(config.corpus.split_directory(), 'lda.mat')) - config.feature_config.generate_features(config.corpus, overwrite=True) - - -def gmm_gselect_func(config, x): - log_path = os.path.join(config.train_directory, 'log', 'gselect.{}.log'.format(x)) - feat_path = os.path.join(config.data_directory, config.feature_file_base_name + '.{}.scp'.format(x)) - with open(log_path, 'w', encoding='utf8') as logf: - subsample_feats_proc = subprocess.Popen([thirdparty_binary('subsample-feats'), - '--n=' + str(config.subsample), - 'scp:' + feat_path, - 'ark:-'], - stdout=subprocess.PIPE, - stderr=logf) - - gselect_proc = subprocess.Popen([thirdparty_binary('gmm-gselect'), - '--n=' + str(config.num_gselect), - os.path.join(config.train_directory, '1.dubm'), - 'ark:-', - 'ark:' + os.path.join(config.train_directory, 'gselect.{}'.format(x))], - stdin=subsample_feats_proc.stdout, - stderr=logf) - gselect_proc.communicate() - - -def gmm_gselect(config, num_jobs): - """ - Multiprocessing function that stores Gaussian selection indices on disk - - See: - - - http://kaldi-asr.org/doc/gmm-gselect_8cc.html - - for more details - on the Kaldi binary this runs. - - Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_diag_ubm.sh - for the original bash script that this function was based on. 
- - Parameters - ---------- - config : :class:`~aligner.config.DiagUbmConfig` - Configuration object for training - num_jobs : int - The number of processes to use in calculation - - """ - jobs = [(config, x) for x in range(num_jobs)] - if config.use_mp: - run_mp(gmm_gselect_func, jobs) - else: - run_non_mp(gmm_gselect_func, jobs) - - -def acc_global_stats_func(config, x, iteration): - log_path = os.path.join(config.train_directory, 'log', 'acc.{}.{}.log'.format(iteration, x)) - feat_path = os.path.join(config.data_directory, config.feature_file_base_name + '.{}.scp'.format(x)) - with open(log_path, 'w', encoding='utf8') as logf: - subsample_feats_proc = subprocess.Popen([thirdparty_binary('subsample-feats'), - '--n=' + str(config.subsample), - 'scp:' + feat_path, - 'ark:-'], - stdout=subprocess.PIPE, - stderr=logf) - gmm_global_acc_proc = subprocess.Popen([thirdparty_binary('gmm-global-acc-stats'), - '--gselect=' + 'ark:' + os.path.join(config.train_directory, - 'gselect.{}'.format(x)), - os.path.join(config.train_directory, '{}.dubm'.format(iteration)), - 'ark:-', - os.path.join(config.train_directory, '{}.{}.acc'.format(iteration, x))], - stderr=logf, - stdin=subsample_feats_proc.stdout) - gmm_global_acc_proc.communicate() - - -def acc_global_stats(config, num_jobs, iteration): - """ - Multiprocessing function that accumulates global GMM stats - - See: - - - http://kaldi-asr.org/doc/gmm-global-acc-stats_8cc.html - - for more details - on the Kaldi binary this runs. - - Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_diag_ubm.sh - for the original bash script that this function was based on. 
- - Parameters - ---------- - config : :class:`~aligner.config.DiagUbmConfig` - Configuration object for training - num_jobs : int - The number of processes to use in calculation - iteration : int - Iteration to calculate stats for - """ - jobs = [(config, x, iteration) for x in range(num_jobs)] - if config.use_mp: - run_mp(acc_global_stats_func, jobs) - else: - run_non_mp(acc_global_stats_func, jobs) - - -def gauss_to_post_func(config, x): - modified_posterior_scale = config.posterior_scale * config.subsample - log_path = os.path.join(config.train_directory, 'log', 'post.{}.log'.format(x)) - feat_path = os.path.join(config.data_directory, config.feature_file_base_name + '.{}.scp'.format(x)) - with open(log_path, 'w', encoding='utf8') as logf: - subsample_feats_proc = subprocess.Popen([thirdparty_binary('subsample-feats'), - '--n=' + str(config.subsample), - 'scp:' + feat_path, - 'ark:-'], - stdout=subprocess.PIPE, - stderr=logf) - gmm_global_get_post_proc = subprocess.Popen([thirdparty_binary('gmm-global-get-post'), - '--n=' + str(config.num_gselect), - '--min-post=' + str(config.min_post), - os.path.join(config.train_directory, 'final.dubm'), - 'ark:-', - 'ark:-'], - stdout=subprocess.PIPE, - stdin=subsample_feats_proc.stdout, - stderr=logf) - scale_post_proc = subprocess.Popen([thirdparty_binary('scale-post'), - 'ark:-', - str(modified_posterior_scale), - 'ark:' + os.path.join(config.train_directory, 'post.{}'.format(x))], - stdin=gmm_global_get_post_proc.stdout, - stderr=logf) - scale_post_proc.communicate() - - -def gauss_to_post(config, num_jobs): - """ - Multiprocessing function that does Gaussian selection and posterior extraction - - See: - - - http://kaldi-asr.org/doc/gmm-global-get-post_8cc.html - - http://kaldi-asr.org/doc/scale-post_8cc.html - - for more details - on the Kaldi binary this runs. 
- - Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/online/nnet2/train_ivector_extractor.sh - for the original bash script that this function was based on. - - Parameters - ---------- - config : :class:`~aligner.config.iVectorExtractorConfig` - Configuration object for training - num_jobs : int - The number of processes to use in calculation - """ - jobs = [(config, x) for x in range(num_jobs)] - if config.use_mp: - run_mp(gauss_to_post_func, jobs) - else: - run_non_mp(gauss_to_post_func, jobs) - - -def acc_ivector_stats_func(config, x, iteration): - log_path = os.path.join(config.train_directory, 'log', 'acc.{}.{}.log'.format(iteration, x)) - feat_path = os.path.join(config.data_directory, config.feature_config.feature_id + '.{}.scp'.format(x)) - with open(log_path, 'w', encoding='utf8') as logf: - subsample_feats_proc = subprocess.Popen([thirdparty_binary('subsample-feats'), - '--n=' + str(config.subsample), - 'scp:' + feat_path, - 'ark:-'], - stdout=subprocess.PIPE, - stderr=logf) - acc_stats_proc = subprocess.Popen([thirdparty_binary('ivector-extractor-acc-stats'), - os.path.join(config.train_directory, '{}.ie'.format(iteration)), - 'ark:-', - 'ark:' + os.path.join(config.train_directory, 'post.{}'.format(x)), - os.path.join(config.train_directory, 'accinit.{}.{}'.format(iteration, x))], - stdin=subsample_feats_proc.stdout, - stderr=logf) - acc_stats_proc.communicate() - - -def acc_ivector_stats(config, num_jobs, iteration): - """ - Multiprocessing function that calculates i-vector extractor stats - - See: - - - http://kaldi-asr.org/doc/ivector-extractor-acc-stats_8cc.html - - http://kaldi-asr.org/doc/ivector-extractor-sum-accs_8cc.html - - for more details - on the Kaldi binary this runs. - - Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/online/nnet2/train_ivector_extractor.sh - for the original bash script that this function was based on. 
- - Parameters - ---------- - config : :class:`~aligner.config.iVectorExtractorConfig` - Configuration object for training - num_jobs : int - The number of processes to use in calculation - iteration : int - Iteration to calculate stats for - """ - jobs = [(config, x, iteration) for x in range(num_jobs)] - if config.use_mp: - run_mp(acc_ivector_stats_func, jobs) - else: - run_non_mp(acc_ivector_stats_func, jobs) - - accinits = [os.path.join(config.train_directory, 'accinit.{}.{}'.format(iteration, j)) for j in range(num_jobs)] - log_path = os.path.join(config.train_directory, 'log', 'sum_acc.{}.log'.format(iteration)) - with open(log_path, 'w', encoding='utf8') as logf: - sum_accs_proc = subprocess.Popen([thirdparty_binary('ivector-extractor-sum-accs'), - '--parallel=true'] - + accinits - + [os.path.join(config.train_directory, 'acc.{}'.format(iteration))], - stderr=logf) - - sum_accs_proc.communicate() - - -def extract_ivectors_func(config, job_id): - """ - - Parameters - ---------- - config : :class:`~aligner.trainers.IvectorExtractorTrainer` - Configuration object for training - job_id : int - Job identifier - """ - - log_dir = os.path.join(config.align_directory, 'log') - os.makedirs(log_dir, exist_ok=True) - ivector_mdl = os.path.join(config.train_directory, 'final.ie') - log_path = os.path.join(config.align_directory, 'log', 'extract_ivectors.{}.log'.format(job_id)) - feat_name = config.feature_file_base_name - feat_name += '.{}.scp' - # features_path = os.path.join(config.data_directory, 'features_for_ivector.{}.scp'.format(x)) - ivectors_path = os.path.join(config.train_directory, 'ivectors.{}'.format(job_id)) - post_path = os.path.join(config.align_directory, 'post.{}'.format(job_id)) - ali_path = os.path.join(config.align_directory, 'ali.{}'.format(job_id)) - weight_path = os.path.join(config.align_directory, 'weight.{}'.format(job_id)) - mdl_path = os.path.join(config.align_directory, 'final.mdl') - gmm_feats = os.path.join(config.data_directory, 
feat_name.format(job_id)) - features_path = gmm_feats - spk2utt_path = os.path.join(config.data_directory, 'spk2utt.{}'.format(job_id)) - sil_phones = config.dictionary.silence_csl - - silence_weight = 0.0 - posterior_scale = 0.1 - max_count = 100 - with open(log_path, 'w', encoding='utf8') as logf: - ali_to_post_proc = subprocess.Popen([thirdparty_binary('ali-to-post'), - 'ark:' + ali_path, 'ark:-'], - stderr=logf, - stdout=subprocess.PIPE) - weight_silence_proc = subprocess.Popen([thirdparty_binary('weight-silence-post'), - str(silence_weight), - sil_phones, - mdl_path, - 'ark:-', 'ark:-'], - stderr=logf, - stdin=ali_to_post_proc.stdout, - stdout=subprocess.PIPE) - post_to_weight_proc = subprocess.Popen([thirdparty_binary('post-to-weights'), - 'ark:-', 'ark:' + weight_path], - stderr=logf, - stdin=weight_silence_proc.stdout) - post_to_weight_proc.communicate() - - gmm_global_get_post_proc = subprocess.Popen([thirdparty_binary('gmm-global-get-post'), - '--n=' + str(config.num_gselect), - '--min-post=' + str(config.min_post), - os.path.join(config.train_directory, 'final.dubm'), - 'scp:' + gmm_feats, - 'ark:-'], - stdout=subprocess.PIPE, - stderr=logf) - extract_proc = subprocess.Popen([thirdparty_binary('ivector-extract'), - '--acoustic-weight={}'.format(posterior_scale), - '--compute-objf-change=true', - '--max-count={}'.format(max_count), - '--spk2utt=ark:' + spk2utt_path, - ivector_mdl, - 'scp:' + features_path, - 'ark,s,cs:-', - 'ark,t:' + ivectors_path], - stderr=logf, - stdin=gmm_global_get_post_proc.stdout) - extract_proc.communicate() - utt_ivectors = [] - with open(ivectors_path, 'r', encoding='utf8') as f: - for line in f: - line = line.strip() - if not line: - continue - line = line.split() - speaker = line[0] - data = line[1:] - for utt in config.corpus.speak_utt_mapping[speaker]: - utt_ivectors.append([utt] + data) - - with open(ivectors_path, 'w', newline='', encoding='utf8') as f: - for u in utt_ivectors: - f.write(' '.join(u)) - f.write('\n') - - 
feat_scp_path = os.path.join(config.data_directory, 'feats.{}.scp'.format(job_id)) - with open(os.devnull, 'w', encoding='utf8') as devnull: - dim_proc = subprocess.Popen([thirdparty_binary('feat-to-dim'), - 'scp:' + feat_scp_path, '-'], - stdout=subprocess.PIPE, - stderr=devnull) - stdout, stderr = dim_proc.communicate() - feat_dim = int(stdout.decode('utf8').strip()) - - ivector_ark_path = os.path.join(config.data_directory, 'ivector.{}.ark'.format(job_id)) - ivector_scp_path = os.path.join(config.data_directory, 'ivector.{}.scp'.format(job_id)) - append_proc = subprocess.Popen([thirdparty_binary('append-vector-to-feats'), - 'scp:' + feat_scp_path, 'ark:' + ivectors_path, - 'ark:-'], - stderr=logf, - stdout=subprocess.PIPE) - select_proc = subprocess.Popen([thirdparty_binary('select-feats'), - "{}-{}".format(feat_dim, feat_dim + config.ivector_dimension - 1), - 'ark:-', - 'ark,scp:{},{}'.format(ivector_ark_path, ivector_scp_path)], - stderr=logf, - stdin=append_proc.stdout) - select_proc.communicate() - - -def extract_ivectors(config, num_jobs): - """ - Multiprocessing function that extracts i-vectors. - - See: - - - http://kaldi-asr.org/doc/ivector-extract-online2_8cc.html - - http://kaldi-asr.org/doc/copy-feats_8cc.html - - for more details - on the Kaldi binary this runs. - - Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh - for the original bash script that this function was based on. 
- - Parameters - ---------- - config : :class:`~aligner.config.iVectorExtractorConfig` - Configuration object for training - num_jobs : int - The number of processes to use in calculation - """ - jobs = [(config, x) for x in range(num_jobs)] - if config.use_mp: - run_mp(extract_ivectors_func, jobs) - else: - run_non_mp(extract_ivectors_func, jobs) diff --git a/montreal_forced_aligner/multiprocessing/__init__.py b/montreal_forced_aligner/multiprocessing/__init__.py new file mode 100644 index 00000000..3a8b4c35 --- /dev/null +++ b/montreal_forced_aligner/multiprocessing/__init__.py @@ -0,0 +1,7 @@ +from .helper import run_mp, run_non_mp, Stopped, Counter +from .alignment import align, compute_alignment_improvement, convert_ali_to_textgrids, compile_information, acc_stats, \ + lda_acc_stats, mono_align_equal, compile_train_graphs, tree_stats, convert_alignments, calc_lda_mllt, calc_fmllr +from .transcription import transcribe, transcribe_fmllr +from .ivector import gmm_gselect, acc_global_stats, acc_ivector_stats, extract_ivectors, gauss_to_post, segment_vad, \ + classify_speakers +from .pronunciations import generate_pronunciations diff --git a/montreal_forced_aligner/multiprocessing/alignment.py b/montreal_forced_aligner/multiprocessing/alignment.py new file mode 100644 index 00000000..0f50decb --- /dev/null +++ b/montreal_forced_aligner/multiprocessing/alignment.py @@ -0,0 +1,1022 @@ +import subprocess +import os +import shutil +import re +from decimal import Decimal +import statistics + +from .helper import make_path_safe, run_mp, run_non_mp, thirdparty_binary + +from ..textgrid import ctm_to_textgrid, parse_ctm + +from ..exceptions import AlignmentError + + +def parse_transitions(path, phones_path): + state_extract_pattern = re.compile(r'Transition-state (\d+): phone = (\w+)') + id_extract_pattern = re.compile(r'Transition-id = (\d+)') + cur_phone = None + current = 0 + with open(path, encoding='utf8') as f, open(phones_path, 'w', encoding='utf8') as outf: + 
outf.write('{} {}\n'.format('', 0)) + for line in f: + line = line.strip() + if line.startswith('Transition-state'): + m = state_extract_pattern.match(line) + _, phone = m.groups() + if phone != cur_phone: + current = 0 + cur_phone = phone + else: + m = id_extract_pattern.match(line) + id = m.groups()[0] + outf.write('{}_{} {}\n'.format(phone, current, id)) + current += 1 + + +def acc_stats_func(directory, iteration, job_name, feature_string): + log_path = os.path.join(directory, 'log', 'acc.{}.{}.log'.format(iteration, job_name)) + model_path = os.path.join(directory, '{}.mdl'.format(iteration)) + acc_path = os.path.join(directory, '{}.{}.acc'.format(iteration, job_name)) + ali_path = os.path.join(directory, 'ali.{}'.format(job_name)) + with open(log_path, 'w', encoding='utf8') as log_file: + acc_proc = subprocess.Popen([thirdparty_binary('gmm-acc-stats-ali'), model_path, + '{}'.format(feature_string), "ark,t:" + ali_path, acc_path], + stderr=log_file) + acc_proc.communicate() + + +def acc_stats(iteration, directory, split_directory, num_jobs, config): + """ + Multiprocessing function that computes stats for GMM training + + See http://kaldi-asr.org/doc/gmm-acc-stats-ali_8cc.html for more details + on the Kaldi binary this runs. 
+ + Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_mono.sh + for the bash script this function was extracted from + + Parameters + ---------- + iteration : int + Iteration to calculate stats for + directory : str + Directory of training (monophone, triphone, speaker-adapted triphone + training directories) + split_directory : str + Directory of training data split into the number of jobs + num_jobs : int + The number of processes to use in calculation + """ + jobs = [(directory, iteration, x, + config.feature_config.construct_feature_proc_string(split_directory, directory, x) + ) for x in range(num_jobs)] + + if config.use_mp: + run_mp(acc_stats_func, jobs, config.log_directory) + else: + run_non_mp(acc_stats_func, jobs, config.log_directory) + + +def compile_train_graphs_func(directory, lang_directory, split_directory, job_name, debug=True): + fst_path = os.path.join(directory, 'fsts.{}'.format(job_name)) + tree_path = os.path.join(directory, 'tree') + mdl_path = os.path.join(directory, '0.mdl') + if not os.path.exists(mdl_path): + mdl_path = os.path.join(directory, 'final.mdl') + + log_path = os.path.join(directory, 'log', 'show_transition.log') + transition_path = os.path.join(directory, 'transitions.txt') + phones_file_path = os.path.join(lang_directory, 'phones.txt') + + triphones_file_path = os.path.join(directory, 'triphones.txt') + if debug: + with open(log_path, 'w', encoding='utf8') as log_file: + with open(transition_path, 'w', encoding='utf8') as f: + subprocess.call([thirdparty_binary('show-transitions'), phones_file_path, mdl_path], + stdout=f, stderr=log_file) + parse_transitions(transition_path, triphones_file_path) + log_path = os.path.join(directory, 'log', 'compile-graphs.0.{}.log'.format(job_name)) + + if os.path.exists(triphones_file_path): + phones_file_path = triphones_file_path + words_file_path = os.path.join(lang_directory, 'words.txt') + + with open(os.path.join(split_directory, 
'text.{}.int'.format(job_name)), 'r', encoding='utf8') as inf, \ + open(fst_path, 'wb') as outf, \ + open(log_path, 'w', encoding='utf8') as log_file: + + proc = subprocess.Popen([thirdparty_binary('compile-train-graphs'), + '--read-disambig-syms={}'.format( + os.path.join(lang_directory, 'phones', 'disambig.int')), + tree_path, mdl_path, + os.path.join(lang_directory, 'L.fst'), + "ark:-", "ark:-"], + stdin=inf, stdout=outf, stderr=log_file) + proc.communicate() + + if debug: + utterances = [] + with open(os.path.join(split_directory, 'utt2spk.{}'.format(job_name)), 'r', encoding='utf8') as f: + for line in f: + utt = line.split()[0].strip() + if not utt: + continue + utterances.append(utt) + + with open(log_path, 'a', encoding='utf8') as log_file: + fst_ark_path = os.path.join(directory, 'fsts.{}.ark'.format(job_name)) + fst_scp_path = os.path.join(directory, 'fsts.{}.scp'.format(job_name)) + proc = subprocess.Popen([thirdparty_binary('fstcopy'), + 'ark:{}'.format(fst_path), + 'ark,scp:{},{}'.format(fst_ark_path, fst_scp_path)], stderr=log_file) + proc.communicate() + + temp_fst_path = os.path.join(directory, 'temp.fst.{}'.format(job_name)) + + with open(fst_scp_path, 'r', encoding='utf8') as f: + for line in f: + line = line.strip() + utt = line.split()[0] + + dot_path = os.path.join(directory, '{}.dot'.format(utt)) + fst_proc = subprocess.Popen([thirdparty_binary('fstcopy'), + 'scp:-', + 'scp:echo {} {}|'.format(utt, temp_fst_path)], + stdin=subprocess.PIPE, stderr=log_file) + fst_proc.communicate(input=line.encode()) + + draw_proc = subprocess.Popen([thirdparty_binary('fstdraw'), '--portrait=true', + '--isymbols={}'.format(phones_file_path), + '--osymbols={}'.format(words_file_path), temp_fst_path, + dot_path], + stderr=log_file) + draw_proc.communicate() + try: + dot_proc = subprocess.Popen([thirdparty_binary('dot'), '-Tpdf', '-O', dot_path], + stderr=log_file) + dot_proc.communicate() + except FileNotFoundError: + pass + + +def compile_train_graphs(directory, 
lang_directory, split_directory, num_jobs, config, debug=False): + """ + Multiprocessing function that compiles training graphs for utterances + + See http://kaldi-asr.org/doc/compile-train-graphs_8cc.html for more details + on the Kaldi binary this function calls. + + Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_mono.sh + for the bash script that this function was extracted from. + + Parameters + ---------- + directory : str + Directory of training (monophone, triphone, speaker-adapted triphone + training directories) + lang_directory : str + Directory of the language model used + split_directory : str + Directory of training data split into the number of jobs + num_jobs : int + The number of processes to use + """ + log_directory = os.path.join(directory, 'log') + os.makedirs(log_directory, exist_ok=True) + jobs = [(directory, lang_directory, split_directory, x, debug) + for x in range(num_jobs)] + if config.use_mp: + run_mp(compile_train_graphs_func, jobs, log_directory) + else: + run_non_mp(compile_train_graphs_func, jobs, log_directory) + + +def mono_align_equal_func(mono_directory, job_name, feature_string): + fst_path = os.path.join(mono_directory, 'fsts.{}'.format(job_name)) + mdl_path = os.path.join(mono_directory, '0.mdl') + log_path = os.path.join(mono_directory, 'log', 'align.0.{}.log'.format(job_name)) + ali_path = os.path.join(mono_directory, 'ali.{}'.format(job_name)) + acc_path = os.path.join(mono_directory, '0.{}.acc'.format(job_name)) + with open(log_path, 'w', encoding='utf8') as log_file: + align_proc = subprocess.Popen([thirdparty_binary('align-equal-compiled'), "ark:" + fst_path, + '{}'.format(feature_string), 'ark:' + ali_path], + stderr=log_file) + align_proc.communicate() + stats_proc = subprocess.Popen([thirdparty_binary('gmm-acc-stats-ali'), '--binary=true', + mdl_path, '{}'.format(feature_string), 'ark:' + ali_path, acc_path], + stdin=align_proc.stdout, stderr=log_file) + stats_proc.communicate() + + 
+def mono_align_equal(mono_directory, split_directory, num_jobs, config): + """ + Multiprocessing function that creates equal alignments for base monophone training + + See http://kaldi-asr.org/doc/align-equal-compiled_8cc.html for more details + on the Kaldi binary this function calls. + + Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_mono.sh + for the bash script that this function was extracted from. + + Parameters + ---------- + mono_directory : str + Directory of monophone training + split_directory : str + Directory of training data split into the number of jobs + num_jobs : int + The number of processes to use + """ + + jobs = [(mono_directory, x, + config.feature_config.construct_feature_proc_string(split_directory, mono_directory, x), + ) + for x in range(num_jobs)] + + if config.use_mp: + run_mp(mono_align_equal_func, jobs, config.log_directory) + else: + run_non_mp(mono_align_equal_func, jobs, config.log_directory) + + +def align_func(directory, iteration, job_name, mdl, config, feature_string, output_directory): + fst_path = os.path.join(directory, 'fsts.{}'.format(job_name)) + log_path = os.path.join(output_directory, 'log', 'align.{}.{}.log'.format(iteration, job_name)) + ali_path = os.path.join(output_directory, 'ali.{}'.format(job_name)) + score_path = os.path.join(output_directory, 'ali.{}.scores'.format(job_name)) + with open(log_path, 'w', encoding='utf8') as log_file: + align_proc = subprocess.Popen([thirdparty_binary('gmm-align-compiled'), + '--transition-scale={}'.format(config['transition_scale']), + '--acoustic-scale={}'.format(config['acoustic_scale']), + '--self-loop-scale={}'.format(config['self_loop_scale']), + '--beam={}'.format(config['beam']), + '--retry-beam={}'.format(config['retry_beam']), + '--careful=false', + mdl, + "ark:" + fst_path, '{}'.format(feature_string), "ark,t:" + ali_path, + "ark,t:" + score_path], + stderr=log_file) + align_proc.communicate() + + +def align(iteration, directory, 
split_directory, optional_silence, num_jobs, config, output_directory=None): + """ + Multiprocessing function that aligns based on the current model + + See http://kaldi-asr.org/doc/gmm-align-compiled_8cc.html and + http://kaldi-asr.org/doc/gmm-boost-silence_8cc.html for more details + on the Kaldi binary this function calls. + + Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/align_si.sh + for the bash script this function was based on. + + Parameters + ---------- + iteration : int or str + Iteration to align + directory : str + Directory of training (monophone, triphone, speaker-adapted triphone + training directories) + split_directory : str + Directory of training data split into the number of jobs + optional_silence : str + Colon-separated list of silence phones to boost + num_jobs : int + The number of processes to use in calculation + config : :class:`~aligner.config.MonophoneConfig`, :class:`~aligner.config.TriphoneConfig` or :class:`~aligner.config.TriphoneFmllrConfig` + Configuration object for training + """ + if output_directory is None: + output_directory = directory + log_directory = os.path.join(output_directory, 'log') + mdl_path = os.path.join(directory, '{}.mdl'.format(iteration)) + mdl = "{} --boost={} {} {} - |".format(thirdparty_binary('gmm-boost-silence'), + config.boost_silence, optional_silence, make_path_safe(mdl_path)) + + jobs = [(directory, iteration, x, mdl, config.align_options, + config.feature_config.construct_feature_proc_string(split_directory, directory, x), + output_directory) for x in range(num_jobs)] + + if config.use_mp: + run_mp(align_func, jobs, log_directory) + else: + run_non_mp(align_func, jobs, log_directory) + + error_logs = [] + for i in range(num_jobs): + log_path = os.path.join(output_directory, 'log', 'align.{}.{}.log'.format(iteration, i)) + with open(log_path, 'r', encoding='utf8') as f: + for line in f: + if line.strip().startswith('ERROR'): + error_logs.append(log_path) + break + if 
error_logs: + message = 'There were {} job(s) with errors. For more information, please see the following logs:\n\n{}' + raise (AlignmentError(message.format(len(error_logs), '\n'.join(error_logs)))) + + +def compile_information_func(log_directory, corpus, job_num): + align_path = os.path.join(log_directory, 'align.final.{}.log'.format(job_num)) + unaligned = {} + output_path = os.path.join(log_directory, 'unaligned.{}.log'.format(job_num)) + with open(align_path, 'r', encoding='utf8') as f: + for line in f: + m = re.search(r'Did not successfully decode file (.*?),', line) + if m is not None: + utt = m.groups()[0] + unaligned[utt] = 'Could not decode (beam too narrow)' + features_path = os.path.join(corpus.split_directory(), 'log', 'make_mfcc.{}.log'.format(job_num)) + with open(features_path, 'r', encoding='utf8') as f: + for line in f: + m = re.search(r'Segment (.*?) too short', line) + if m is not None: + utt = m.groups()[0] + unaligned[utt] = 'Too short to get features' + with open(output_path, 'w', encoding='utf8') as f: + for k, v in unaligned.items(): + f.write('{} {}\n'.format(k, v)) + + +def compile_information(model_directory, corpus, num_jobs, config): + log_dir = os.path.join(model_directory, 'log') + + jobs = [(log_dir, corpus, x) + for x in range(num_jobs)] + + run_non_mp(compile_information_func, jobs, log_dir) + + unaligned = {} + for j in jobs: + path = os.path.join(log_dir, 'unaligned.{}.log'.format(j[-1])) + with open(path, 'r', encoding='utf8') as f: + for line in f: + line = line.strip() + utt, reason = line.split(' ', maxsplit=1) + unaligned[utt] = reason + return unaligned + + +def compute_alignment_improvement_func(iteration, config, model_directory, job_name): + try: + text_int_path = os.path.join(config.data_directory, 'text.{}.int'.format(job_name)) + log_path = os.path.join(model_directory, 'log', 'get_ctm.{}.{}.log'.format(iteration, job_name)) + ali_path = os.path.join(model_directory, 'ali.{}'.format(job_name)) + model_path = 
os.path.join(model_directory, '{}.mdl'.format(iteration)) + phone_ctm_path = os.path.join(model_directory, 'phone.{}.{}.ctm'.format(iteration, job_name)) + if os.path.exists(phone_ctm_path): + return + + frame_shift = config.feature_config.frame_shift / 1000 + with open(log_path, 'w', encoding='utf8') as log_file: + lin_proc = subprocess.Popen([thirdparty_binary('linear-to-nbest'), "ark:" + ali_path, + "ark:" + text_int_path, + '', '', 'ark:-'], + stdout=subprocess.PIPE, stderr=log_file) + det_proc = subprocess.Popen([thirdparty_binary('lattice-determinize-pruned'), + 'ark:-', 'ark:-'], + stdin=lin_proc.stdout, stderr=log_file, + stdout=subprocess.PIPE) + align_proc = subprocess.Popen([thirdparty_binary('lattice-align-words'), + os.path.join(config.dictionary.phones_dir, 'word_boundary.int'), model_path, + 'ark:-', 'ark:-'], + stdin=det_proc.stdout, stderr=log_file, + stdout=subprocess.PIPE) + phone_proc = subprocess.Popen([thirdparty_binary('lattice-to-phone-lattice'), model_path, + 'ark:-', "ark:-"], + stdin=align_proc.stdout, + stdout=subprocess.PIPE, + stderr=log_file) + nbest_proc = subprocess.Popen([thirdparty_binary('nbest-to-ctm'), + '--frame-shift={}'.format(frame_shift), + "ark:-", phone_ctm_path], + stdin=phone_proc.stdout, + stderr=log_file) + nbest_proc.communicate() + mapping = config.dictionary.reversed_phone_mapping + actual_lines = [] + with open(phone_ctm_path, 'r', encoding='utf8') as f: + for line in f: + line = line.strip() + if line == '': + continue + line = line.split(' ') + utt = line[0] + begin = Decimal(line[2]) + duration = Decimal(line[3]) + end = begin + duration + label = line[4] + try: + label = mapping[int(label)] + except KeyError: + pass + for p in config.dictionary.positions: + if label.endswith(p): + label = label[:-1 * len(p)] + actual_lines.append([utt, begin, end, label]) + with open(phone_ctm_path, 'w', encoding='utf8') as f: + for line in actual_lines: + f.write('{}\n'.format(' '.join(map(str, line)))) + except Exception as 
e: + raise (Exception(str(e))) + + +def parse_iteration_alignments(directory, iteration, num_jobs): + data = {} + for j in range(num_jobs): + phone_ctm_path = os.path.join(directory, 'phone.{}.{}.ctm'.format(iteration, j)) + with open(phone_ctm_path, 'r', encoding='utf8') as f: + for line in f: + line = line.strip() + if line == '': + continue + line = line.split(' ') + utt = line[0] + begin = Decimal(line[1]) + end = Decimal(line[2]) + label = line[3] + if utt not in data: + data[utt] = [] + data[utt].append([begin, end, label]) + return data + + +def compare_alignments(alignments_one, alignments_two, frame_shift): + utterances_aligned_diff = len(alignments_two) - len(alignments_one) + utts_one = set(alignments_one.keys()) + utts_two = set(alignments_two.keys()) + common_utts = utts_one.intersection(utts_two) + differences = [] + for u in common_utts: + end = alignments_one[u][-1][1] + t = Decimal('0.0') + one_alignment = alignments_one[u] + two_alignment = alignments_two[u] + difference = 0 + while t < end: + one_label = None + two_label = None + for b, e, l in one_alignment: + if t < b: + continue + if t >= e: + break + one_label = l + for b, e, l in two_alignment: + if t < b: + continue + if t >= e: + break + two_label = l + if one_label != two_label: + difference += frame_shift + t += frame_shift + difference /= end + differences.append(difference) + if differences: + mean_difference = statistics.mean(differences) + else: + mean_difference = 'N/A' + return utterances_aligned_diff, mean_difference + + +def compute_alignment_improvement(iteration, config, model_directory, num_jobs): + jobs = [(iteration, config, model_directory, x) for x in range(num_jobs)] + if config.use_mp: + run_mp(compute_alignment_improvement_func, jobs, config.log_directory) + else: + run_non_mp(compute_alignment_improvement_func, jobs, config.log_directory) + + alignment_diff_path = os.path.join(model_directory, 'train_change.csv') + if iteration == 0 or iteration not in 
config.realignment_iterations: + return + ind = config.realignment_iterations.index(iteration) + if ind != 0: + previous_iteration = config.realignment_iterations[ind - 1] + else: + previous_iteration = 0 + try: + previous_alignments = parse_iteration_alignments(model_directory, previous_iteration, num_jobs) + except FileNotFoundError: + return + current_alignments = parse_iteration_alignments(model_directory, iteration, num_jobs) + utterance_aligned_diff, mean_difference = compare_alignments(previous_alignments, current_alignments, + config.feature_config.frame_shift) + if not os.path.exists(alignment_diff_path): + with open(alignment_diff_path, 'w', encoding='utf8') as f: + f.write('iteration,number_aligned,number_previously_aligned,' + 'difference_in_utts_aligned,mean_boundary_change\n') + if iteration in config.realignment_iterations: + with open(alignment_diff_path, 'a', encoding='utf8') as f: + f.write('{},{},{},{},{}\n'.format(iteration, len(current_alignments), + len(previous_alignments), utterance_aligned_diff, mean_difference)) + if not config.debug: + for j in range(num_jobs): + phone_ctm_path = os.path.join(model_directory, 'phone.{}.{}.ctm'.format(previous_iteration, j)) + os.remove(phone_ctm_path) + + +def ali_to_textgrid_func(model_directory, word_path, split_directory, job_name, frame_shift): + text_int_path = os.path.join(split_directory, 'text.{}.int'.format(job_name)) + log_path = os.path.join(model_directory, 'log', 'get_ctm_align.{}.log'.format(job_name)) + ali_path = os.path.join(model_directory, 'ali.{}'.format(job_name)) + model_path = os.path.join(model_directory, 'final.mdl') + aligned_path = os.path.join(model_directory, 'aligned.{}'.format(job_name)) + nbest_path = os.path.join(model_directory, 'nbest.{}'.format(job_name)) + word_ctm_path = os.path.join(model_directory, 'word_ctm.{}'.format(job_name)) + phone_ctm_path = os.path.join(model_directory, 'phone_ctm.{}'.format(job_name)) + + with open(log_path, 'w', encoding='utf8') as 
log_file: + lin_proc = subprocess.Popen([thirdparty_binary('linear-to-nbest'), "ark:" + ali_path, + "ark:" + text_int_path, + '', '', 'ark,t:' + nbest_path], + stdout=subprocess.PIPE, stderr=log_file) + + lin_proc.communicate() + lin_proc = subprocess.Popen([thirdparty_binary('linear-to-nbest'), "ark:" + ali_path, + "ark:" + text_int_path, + '', '', 'ark:-'], + stdout=subprocess.PIPE, stderr=log_file) + det_proc = subprocess.Popen([thirdparty_binary('lattice-determinize-pruned'), + 'ark:-', 'ark:-'], + stdin=lin_proc.stdout, stderr=log_file, + stdout=subprocess.PIPE) + align_proc = subprocess.Popen([thirdparty_binary('lattice-align-words'), + word_path, model_path, + 'ark:-', 'ark,t:' + aligned_path], + stdin=det_proc.stdout, stderr=log_file) + align_proc.communicate() + + subprocess.call([thirdparty_binary('nbest-to-ctm'), + '--frame-shift={}'.format(frame_shift), + 'ark:' + aligned_path, + word_ctm_path], + stderr=log_file) + phone_proc = subprocess.Popen([thirdparty_binary('lattice-to-phone-lattice'), model_path, + 'ark:' + aligned_path, "ark:-"], + stdout=subprocess.PIPE, + stderr=log_file) + nbest_proc = subprocess.Popen([thirdparty_binary('nbest-to-ctm'), + '--frame-shift={}'.format(frame_shift), + "ark:-", phone_ctm_path], + stdin=phone_proc.stdout, + stderr=log_file) + nbest_proc.communicate() + + +def convert_ali_to_textgrids(align_config, output_directory, model_directory, dictionary, corpus, num_jobs, config): + """ + Multiprocessing function that aligns based on the current model + + See: + + - http://kaldi-asr.org/doc/linear-to-nbest_8cc.html + - http://kaldi-asr.org/doc/lattice-align-words_8cc.html + - http://kaldi-asr.org/doc/lattice-to-phone-lattice_8cc.html + - http://kaldi-asr.org/doc/nbest-to-ctm_8cc.html + + for more details + on the Kaldi binaries this function calls. + + Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/get_train_ctm.sh + for the bash script that this function was based on. 
+ + Parameters + ---------- + output_directory : str + Directory to write TextGrid files to + model_directory : str + Directory of training (monophone, triphone, speaker-adapted triphone + training directories) + dictionary : :class:`~montreal_forced_aligner.dictionary.Dictionary` + Dictionary object that has information about pronunciations + corpus : :class:`~montreal_forced_aligner.corpus.AlignableCorpus` + Corpus object that has information about the dataset + num_jobs : int + The number of processes to use in calculation + + Raises + ------ + CorpusError + If the files per speaker exceeds the number of files that are + allowed to be open on the computer (for Unix-based systems) + + """ + log_directory = os.path.join(model_directory, 'log') + frame_shift = align_config.feature_config.frame_shift / 1000 + word_path = os.path.join(dictionary.phones_dir, 'word_boundary.int') + jobs = [(model_directory, word_path, corpus.split_directory(), x, frame_shift) + for x in range(num_jobs)] + if align_config.use_mp: + run_mp(ali_to_textgrid_func, jobs, log_directory) + else: + run_non_mp(ali_to_textgrid_func, jobs, log_directory) + + word_ctm = {} + phone_ctm = {} + for i in range(num_jobs): + word_ctm_path = os.path.join(model_directory, 'word_ctm.{}'.format(i)) + phone_ctm_path = os.path.join(model_directory, 'phone_ctm.{}'.format(i)) + if not os.path.exists(word_ctm_path): + continue + parsed = parse_ctm(word_ctm_path, corpus, dictionary, mode='word') + for k, v in parsed.items(): + if k not in word_ctm: + word_ctm[k] = v + else: + word_ctm[k].update(v) + parsed = parse_ctm(phone_ctm_path, corpus, dictionary, mode='phone') + for k, v in parsed.items(): + if k not in phone_ctm: + phone_ctm[k] = v + else: + phone_ctm[k].update(v) + ctm_to_textgrid(word_ctm, phone_ctm, output_directory, corpus, dictionary) + + +def tree_stats_func(directory, ci_phones, mdl, feature_string, ali_path, job_name): + context_opts = [] + log_path = os.path.join(directory, 'log', 
'acc_tree.{}.log'.format(job_name)) + + treeacc_path = os.path.join(directory, '{}.treeacc'.format(job_name)) + + with open(log_path, 'w', encoding='utf8') as log_file: + subprocess.call([thirdparty_binary('acc-tree-stats')] + context_opts + + ['--ci-phones=' + ci_phones, mdl, '{}'.format(feature_string), + "ark:" + ali_path, + treeacc_path], stderr=log_file) + + +def tree_stats(directory, align_directory, split_directory, ci_phones, num_jobs, config): + """ + Multiprocessing function that computes stats for decision tree training + + See http://kaldi-asr.org/doc/acc-tree-stats_8cc.html for more details + on the Kaldi binary this runs. + + Parameters + ---------- + directory : str + Directory of training (triphone, speaker-adapted triphone + training directories) + align_directory : str + Directory of previous alignment + split_directory : str + Directory of training data split into the number of jobs + ci_phones : str + Colon-separated list of context-independent phones + num_jobs : int + The number of processes to use in calculation + """ + + mdl_path = os.path.join(align_directory, 'final.mdl') + + jobs = [(directory, ci_phones, mdl_path, + config.feature_config.construct_feature_proc_string(split_directory, directory, x), + os.path.join(align_directory, 'ali.{}'.format(x)), x) for x in range(num_jobs)] + + if config.use_mp: + run_mp(tree_stats_func, jobs, config.log_directory) + else: + run_non_mp(tree_stats_func, jobs, config.log_directory) + + tree_accs = [os.path.join(directory, '{}.treeacc'.format(x)) for x in range(num_jobs)] + log_path = os.path.join(directory, 'log', 'sum_tree_acc.log') + with open(log_path, 'w', encoding='utf8') as log_file: + subprocess.call([thirdparty_binary('sum-tree-stats'), os.path.join(directory, 'treeacc')] + + tree_accs, stderr=log_file) + # for f in tree_accs: + # os.remove(f) + + +def convert_alignments_func(directory, align_directory, job_name): + mdl_path = os.path.join(directory, '1.mdl') + tree_path = 
os.path.join(directory, 'tree') + ali_mdl_path = os.path.join(align_directory, 'final.mdl') + ali_path = os.path.join(align_directory, 'ali.{}'.format(job_name)) + new_ali_path = os.path.join(directory, 'ali.{}'.format(job_name)) + + log_path = os.path.join(directory, 'log', 'convert.{}.log'.format(job_name)) + with open(log_path, 'w', encoding='utf8') as log_file: + subprocess.call([thirdparty_binary('convert-ali'), ali_mdl_path, + mdl_path, tree_path, "ark:" + ali_path, + "ark:" + new_ali_path], stderr=log_file) + + +def convert_alignments(directory, align_directory, num_jobs, config): + """ + Multiprocessing function that converts alignments from previous training + + See http://kaldi-asr.org/doc/convert-ali_8cc.html for more details + on the Kaldi binary this runs. + + Parameters + ---------- + directory : str + Directory of training (triphone, speaker-adapted triphone + training directories) + align_directory : str + Directory of previous alignment + num_jobs : int + The number of processes to use in calculation + + """ + + jobs = [(directory, align_directory, x) + for x in range(num_jobs)] + if config.use_mp: + run_mp(convert_alignments_func, jobs, config.log_directory) + else: + run_non_mp(convert_alignments_func, jobs, config.log_directory) + + +def calc_fmllr_func(directory, split_directory, sil_phones, job_name, feature_string, config, initial, + model_name='final'): + + log_path = os.path.join(directory, 'log', 'fmllr.{}.{}.log'.format(model_name, job_name)) + ali_path = os.path.join(directory, 'ali.{}'.format(job_name)) + mdl_path = os.path.join(directory, '{}.mdl'.format(model_name)) + spk2utt_path = os.path.join(split_directory, 'spk2utt.{}'.format(job_name)) + if not initial: + tmp_trans_path = os.path.join(directory, 'trans.temp.{}'.format(job_name)) + trans_path = os.path.join(directory, 'trans.{}'.format(job_name)) + cmp_trans_path = os.path.join(directory, 'trans.cmp.{}'.format(job_name)) + else: + tmp_trans_path = os.path.join(directory, 
'trans.{}'.format(job_name)) + post_path = os.path.join(directory, 'post.{}'.format(job_name)) + weight_path = os.path.join(directory, 'weight.{}'.format(job_name)) + with open(log_path, 'w', encoding='utf8') as log_file: + subprocess.call([thirdparty_binary('ali-to-post'), + "ark:" + ali_path, 'ark:' + post_path], stderr=log_file) + + subprocess.call([thirdparty_binary('weight-silence-post'), '0.0', + sil_phones, mdl_path, 'ark:' + post_path, + 'ark:' + weight_path], stderr=log_file) + + subprocess.call([thirdparty_binary('gmm-est-fmllr'), + '--verbose=4', + '--fmllr-update-type={}'.format(config.fmllr_update_type), + '--spk2utt=ark:' + spk2utt_path, mdl_path, '{}'.format(feature_string), + 'ark,s,cs:' + weight_path, 'ark:' + tmp_trans_path], + stderr=log_file) + + if not initial: + subprocess.call([thirdparty_binary('compose-transforms'), + '--b-is-affine=true', + 'ark:' + tmp_trans_path, 'ark:' + trans_path, + 'ark:' + cmp_trans_path], stderr=log_file) + os.remove(tmp_trans_path) + os.remove(trans_path) + os.rename(cmp_trans_path, trans_path) + else: + trans_path = tmp_trans_path + + +def calc_fmllr(directory, split_directory, sil_phones, num_jobs, config, + initial=False, iteration=None): + """ + Multiprocessing function that computes speaker adaptation (fMLLR) + + See: + + - http://kaldi-asr.org/doc/gmm-est-fmllr_8cc.html + - http://kaldi-asr.org/doc/ali-to-post_8cc.html + - http://kaldi-asr.org/doc/weight-silence-post_8cc.html + - http://kaldi-asr.org/doc/compose-transforms_8cc.html + - http://kaldi-asr.org/doc/transform-feats_8cc.html + + for more details + on the Kaldi binary this runs. + + Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/align_fmllr.sh + for the original bash script that this function was based on. 
+ + Parameters + ---------- + directory : str + Directory of training (triphone, speaker-adapted triphone + training directories) + split_directory : str + Directory of training data split into the number of jobs + sil_phones : str + Colon-separated list of silence phones + num_jobs : int + The number of processes to use in calculation + config : :class:`~aligner.config.TriphoneFmllrConfig` + Configuration object for training + initial : bool, optional + Whether this is the first computation of speaker-adaptation, + defaults to False + iteration : int + Specifies the current iteration, defaults to None + + """ + if iteration is None: + if initial: + model_name = '1' + else: + model_name = 'final' + else: + model_name = iteration + + + jobs = [(directory, split_directory, sil_phones, x, + config.feature_config.construct_feature_proc_string(split_directory, directory, x), + config, initial, model_name) for x in range(num_jobs)] + # if config.use_mp: + # run_mp(calc_fmllr_func, jobs) + # else: + run_non_mp(calc_fmllr_func, jobs, config.log_directory) + + +def lda_acc_stats_func(directory, feature_string, align_directory, config, ci_phones, i): + log_path = os.path.join(directory, 'log', 'ali_to_post.{}.log'.format(i)) + with open(log_path, 'w', encoding='utf8') as log_file: + ali_to_post_proc = subprocess.Popen([thirdparty_binary('ali-to-post'), + 'ark:' + os.path.join(align_directory, 'ali.{}'.format(i)), + 'ark:-'], + stderr=log_file, stdout=subprocess.PIPE) + weight_silence_post_proc = subprocess.Popen([thirdparty_binary('weight-silence-post'), + str(config['boost_silence']), ci_phones, + os.path.join(align_directory, 'final.mdl'), + 'ark:-', 'ark:-'], + stdin=ali_to_post_proc.stdout, + stderr=log_file, stdout=subprocess.PIPE) + acc_lda_post_proc = subprocess.Popen([thirdparty_binary('acc-lda'), + '--rand-prune=' + str(config['random_prune']), + os.path.join(align_directory, 'final.mdl'), + '{}'.format(feature_string), + 'ark,s,cs:-', + os.path.join(directory, 
'lda.{}.acc'.format(i))], + stdin=weight_silence_post_proc.stdout, + stderr=log_file) + acc_lda_post_proc.communicate() + + +def lda_acc_stats(directory, split_directory, align_directory, config, ci_phones, num_jobs): + """ + Multiprocessing function that accumulates LDA statistics + + See: + + - http://kaldi-asr.org/doc/ali-to-post_8cc.html + - http://kaldi-asr.org/doc/weight-silence-post_8cc.html + - http://kaldi-asr.org/doc/acc-lda_8cc.html + - http://kaldi-asr.org/doc/est-lda_8cc.html + + for more details + on the Kaldi binary this runs. + + Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_lda_mllt.sh + for the original bash script that this function was based on. + + Parameters + ---------- + directory : str + Directory of LDA+MLLT training + split_directory : str + Directory of training data split into the number of jobs + align_directory : str + Directory of previous alignment + config : :class:`~aligner.config.LdaMlltConfig` + Configuration object for training + ci_phones : str + Colon-separated list of context-independent phones + num_jobs : int + The number of processes to use in calculation + + """ + jobs = [(directory, + config.feature_config.construct_feature_proc_string(split_directory, directory, x, splice=True), + align_directory, config.lda_options, ci_phones, x) for x in range(num_jobs)] + + + if config.use_mp: + run_mp(lda_acc_stats_func, jobs, config.log_directory) + else: + run_non_mp(lda_acc_stats_func, jobs, config.log_directory) + + log_path = os.path.join(directory, 'log', 'lda_est.log') + acc_list = [] + for x in range(num_jobs): + acc_list.append(os.path.join(directory, 'lda.{}.acc'.format(x))) + with open(log_path, 'w', encoding='utf8') as log_file: + est_lda_proc = subprocess.Popen([thirdparty_binary('est-lda'), + '--write-full-matrix=' + os.path.join(directory, 'full.mat'), + '--dim=' + str(config.lda_dimension), + os.path.join(directory, 'lda.mat')] + acc_list, + stderr=log_file) + 
est_lda_proc.communicate() + + +def calc_lda_mllt_func(directory, feature_string, sil_phones, job_name, config, + initial, + model_name='final'): + log_path = os.path.join(directory, 'log', 'lda_mllt.{}.{}.log'.format(model_name, job_name)) + ali_path = os.path.join(directory, 'ali.{}'.format(job_name)) + if not initial: + mdl_path = os.path.join(directory, '{}.mdl'.format(model_name)) + else: + mdl_path = os.path.join(directory, '1.mdl') + model_name = 1 + + # Estimating MLLT + with open(log_path, 'a', encoding='utf8') as log_file: + post_proc = subprocess.Popen([thirdparty_binary('ali-to-post'), + "ark:" + ali_path, 'ark:-'], + stdout=subprocess.PIPE, stderr=log_file) + + weight_proc = subprocess.Popen([thirdparty_binary('weight-silence-post'), '0.0', + sil_phones, mdl_path, 'ark:-', + 'ark:-'], + stdin=post_proc.stdout, stdout=subprocess.PIPE, stderr=log_file) + acc_proc = subprocess.Popen([thirdparty_binary('gmm-acc-mllt'), + '--rand-prune=' + str(config['random_prune']), + mdl_path, + '{}'.format(feature_string), + 'ark:-', + os.path.join(directory, '{}.{}.macc'.format(model_name, job_name))], + stdin=weight_proc.stdout, stderr=log_file) + acc_proc.communicate() + + +def calc_lda_mllt(directory, data_directory, sil_phones, num_jobs, config, + initial=False, iteration=None): + """ + Multiprocessing function that calculates LDA+MLLT transformations + + See: + + - http://kaldi-asr.org/doc/ali-to-post_8cc.html + - http://kaldi-asr.org/doc/weight-silence-post_8cc.html + - http://kaldi-asr.org/doc/gmm-acc-mllt_8cc.html + - http://kaldi-asr.org/doc/est-mllt_8cc.html + - http://kaldi-asr.org/doc/gmm-transform-means_8cc.html + - http://kaldi-asr.org/doc/compose-transforms_8cc.html + + for more details + on the Kaldi binary this runs. + + Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_lda_mllt.sh + for the original bash script that this function was based on. 
+ + Parameters + ---------- + directory : str + Directory of LDA+MLLT training + data_directory : str + Directory of training data split into the number of jobs + sil_phones : str + Colon-separated list of silence phones + num_jobs : int + The number of processes to use in calculation + config : :class:`~aligner.config.LdaMlltConfig` + Configuration object for training + initial : bool + Flag for first iteration + iteration : int + Current iteration + + """ + if iteration is None: + model_name = 'final' + else: + model_name = iteration + jobs = [(directory, + config.feature_config.construct_feature_proc_string(data_directory, directory, x), + sil_phones, x, config.lda_options, initial, model_name) for x in range(num_jobs)] + + if config.use_mp: + run_mp(calc_lda_mllt_func, jobs, config.log_directory) + else: + run_non_mp(calc_lda_mllt_func, jobs, config.log_directory) + + mdl_path = os.path.join(directory, '{}.mdl'.format(model_name)) + log_path = os.path.join(directory, 'log', 'transform_means.{}.log'.format(model_name)) + previous_mat_path = os.path.join(directory, 'lda.mat') + new_mat_path = os.path.join(directory, 'lda_new.mat') + composed_path = os.path.join(directory, 'lda_composed.mat') + with open(log_path, 'a', encoding='utf8') as log_file: + macc_list = [] + for x in range(num_jobs): + macc_list.append(os.path.join(directory, '{}.{}.macc'.format(model_name, x))) + subprocess.call([thirdparty_binary('est-mllt'), + new_mat_path] + + macc_list, + stderr=log_file) + subprocess.call([thirdparty_binary('gmm-transform-means'), + new_mat_path, + mdl_path, mdl_path], + stderr=log_file) + + if os.path.exists(previous_mat_path): + subprocess.call([thirdparty_binary('compose-transforms'), + new_mat_path, + previous_mat_path, + composed_path], + stderr=log_file) + os.remove(previous_mat_path) + os.rename(composed_path, previous_mat_path) + else: + os.rename(new_mat_path, previous_mat_path) + diff --git a/montreal_forced_aligner/multiprocessing/corpus.py 
b/montreal_forced_aligner/multiprocessing/corpus.py new file mode 100644 index 00000000..31e6e04a --- /dev/null +++ b/montreal_forced_aligner/multiprocessing/corpus.py @@ -0,0 +1,229 @@ +import multiprocessing as mp +from queue import Empty +import traceback +import sys +import os +from textgrid import TextGrid, IntervalTier + +from ..helper import parse_logs, thirdparty_binary, make_path_safe, load_text + +from ..dictionary import sanitize + +from ..exceptions import SampleRateError, CorpusError, WavReadError, SampleRateMismatchError, \ + BitDepthError, TextParseError, TextGridParseError + +from ..corpus.base import extract_temp_channels, get_wav_info + + +def parse_transcription(text): + words = [sanitize(x) for x in text.split()] + words = [x for x in words if x not in ['', '-', "'"]] + return words + + +def parse_wav_file(utt_name, wav_path, lab_path, relative_path, speaker_characters, temp_directory=None): + root = os.path.dirname(wav_path) + try: + wav_info = get_wav_info(wav_path) + sr = wav_info['sample_rate'] + except Exception: + raise WavReadError(wav_path) + if sr < 16000: + raise SampleRateError(wav_path) + bit_depth = wav_info['bit_depth'] + if bit_depth != 16: + raise BitDepthError(wav_path) + if not speaker_characters: + speaker_name = os.path.basename(root) + elif isinstance(speaker_characters, int): + speaker_name = utt_name[:speaker_characters] + elif speaker_characters == 'prosodylab': + speaker_name = utt_name.split('_')[1] + else: + speaker_name = utt_name + speaker_name = speaker_name.strip().replace(' ', '_') + utt_name = utt_name.strip().replace(' ', '_') + return {'utt_name': utt_name, 'speaker_name': speaker_name, 'wav_path': wav_path, + 'wav_info': wav_info, 'relative_path': relative_path} + + +def parse_lab_file(utt_name, wav_path, lab_path, relative_path, speaker_characters, temp_directory=None): + root = os.path.dirname(wav_path) + try: + wav_info = get_wav_info(wav_path) + sr = wav_info['sample_rate'] + except Exception: + raise 
WavReadError(wav_path) + if sr < 16000: + raise SampleRateError(wav_path) + bit_depth = wav_info['bit_depth'] + if bit_depth != 16: + raise BitDepthError(wav_path) + try: + text = load_text(lab_path) + except UnicodeDecodeError: + raise TextParseError(lab_path) + words = parse_transcription(text) + if not words: + raise TextParseError(lab_path) + if not speaker_characters: + speaker_name = os.path.basename(root) + elif isinstance(speaker_characters, int): + speaker_name = utt_name[:speaker_characters] + elif speaker_characters == 'prosodylab': + speaker_name = utt_name.split('_')[1] + else: + speaker_name = utt_name + speaker_name = speaker_name.strip().replace(' ', '_') + utt_name = utt_name.strip().replace(' ', '_') + return {'utt_name': utt_name, 'speaker_name': speaker_name, 'text_file': lab_path, 'wav_path':wav_path, + 'words': words, 'wav_info': wav_info, 'relative_path': relative_path} + +def parse_textgrid_file(recording_name, wav_path, textgrid_path, relative_path, speaker_characters, temp_directory): + file_name = recording_name + try: + wav_info = get_wav_info(wav_path) + sr = wav_info['sample_rate'] + except Exception: + raise WavReadError(wav_path) + if sr < 16000: + raise SampleRateError(wav_path) + bit_depth = wav_info['bit_depth'] + wav_max_time = wav_info['duration'] + if bit_depth != 16: + raise BitDepthError(wav_path) + tg = TextGrid() + try: + tg.read(textgrid_path) + except Exception as e: + exc_type, exc_value, exc_traceback = sys.exc_info() + raise TextGridParseError(textgrid_path, '\n'.join(traceback.format_exception(exc_type, exc_value, exc_traceback))) + n_channels = wav_info['num_channels'] + num_tiers = len(tg.tiers) + if n_channels == 2: + a_name = file_name + "_A" + b_name = file_name + "_B" + + a_path, b_path = extract_temp_channels(wav_path, temp_directory) + elif n_channels > 2: + raise (Exception('More than two channels')) + speaker_ordering = [] + if speaker_characters: + if isinstance(speaker_characters, int): + speaker_name = 
file_name[:speaker_characters] + elif speaker_characters == 'prosodylab': + speaker_name = file_name.split('_')[1] + else: + speaker_name = file_name + speaker_name = speaker_name.strip().replace(' ', '_') + speaker_ordering.append(speaker_name) + segments = {} + utt_wav_mapping = {} + text_mapping = {} + utt_text_file_mapping = {} + utt_speak_mapping = {} + speak_utt_mapping = {} + file_directory_mapping = {} + for i, ti in enumerate(tg.tiers): + if ti.name.lower() == 'notes': + continue + if not isinstance(ti, IntervalTier): + continue + if not speaker_characters: + speaker_name = ti.name.strip().replace(' ', '_') + speaker_ordering.append(speaker_name) + for interval in ti: + text = interval.mark.lower().strip() + words = parse_transcription(text) + if not words: + continue + begin, end = round(interval.minTime, 4), round(interval.maxTime, 4) + if end > wav_max_time: + end = wav_max_time + utt_name = '{}_{}_{}_{}'.format(speaker_name, file_name, begin, end) + utt_name = utt_name.strip().replace(' ', '_').replace('.', '_') + if n_channels == 1: + segments[utt_name] = '{} {} {}'.format(file_name, begin, end) + utt_wav_mapping[file_name] = wav_path + else: + if i < num_tiers / 2: + utt_name += '_A' + segments[utt_name] = '{} {} {}'.format(a_name, begin, end) + utt_wav_mapping[a_name] = a_path + else: + utt_name += '_B' + segments[utt_name] = '{} {} {}'.format(b_name, begin, end) + utt_wav_mapping[b_name] = b_path + text_mapping[utt_name] = ' '.join(words) + utt_text_file_mapping[utt_name] = textgrid_path + utt_speak_mapping[utt_name] = speaker_name + if speaker_name not in speak_utt_mapping: + speak_utt_mapping[speaker_name] = [] + speak_utt_mapping[speaker_name].append(utt_name) + file_names = [file_name] + if n_channels == 2: + file_names.append(a_name) + file_names.append(b_name) + return {'text_file': textgrid_path, 'wav_path':wav_path, 'wav_info': wav_info, 'segments': segments, + 'utt_wav_mapping': utt_wav_mapping, 'text_mapping': text_mapping, + 
'utt_text_file_mapping': utt_text_file_mapping, 'utt_speak_mapping': utt_speak_mapping, + 'speak_utt_mapping': speak_utt_mapping, 'speaker_ordering': speaker_ordering, + 'file_names': file_names, 'relative_path': relative_path, 'recording_name': recording_name + } + + +class CorpusProcessWorker(mp.Process): + def __init__(self, job_q, return_dict, return_q, stopped, initializing=True): + mp.Process.__init__(self) + self.job_q = job_q + self.return_dict = return_dict + self.return_q = return_q + self.stopped = stopped + self.initializing = initializing + + def run(self): + while True: + try: + arguments = self.job_q.get(timeout=1) + except Empty as error: + if self.initializing and not self.stopped.stop_check(): + continue + else: + break + self.initializing = False + self.job_q.task_done() + wav_path = arguments[1] + transcription_path = arguments[2] + + try: + if transcription_path is None: + info = parse_wav_file(*arguments) + elif transcription_path.lower().endswith('.textgrid'): + info = parse_textgrid_file(*arguments) + else: + info = parse_lab_file(*arguments) + self.return_q.put(info) + except WavReadError: + if 'wav_read_errors' not in self.return_dict: + self.return_dict['wav_read_errors'] = [] + self.return_dict['wav_read_errors'].append(wav_path) + except SampleRateError: + if 'unsupported_sample_rate' not in self.return_dict: + self.return_dict['unsupported_sample_rate'] = [] + self.return_dict['unsupported_sample_rate'].append(wav_path) + except BitDepthError: + if 'unsupported_bit_depths' not in self.return_dict: + self.return_dict['unsupported_bit_depths'] = [] + self.return_dict['unsupported_bit_depths'].append(wav_path) + except TextParseError: + if 'decode_error_files' not in self.return_dict: + self.return_dict['decode_error_files'] = [] + self.return_dict['decode_error_files'].append(transcription_path) + except TextGridParseError as e: + if 'textgrid_read_errors' not in self.return_dict: + self.return_dict['textgrid_read_errors'] = {} + 
self.return_dict['textgrid_read_errors'][transcription_path] = e.error + except Exception as e: + self.stopped.stop() + self.return_dict['error'] = arguments, Exception(traceback.format_exception(*sys.exc_info())) + return \ No newline at end of file diff --git a/montreal_forced_aligner/multiprocessing/helper.py b/montreal_forced_aligner/multiprocessing/helper.py new file mode 100644 index 00000000..44c67108 --- /dev/null +++ b/montreal_forced_aligner/multiprocessing/helper.py @@ -0,0 +1,89 @@ +import multiprocessing as mp +from queue import Empty +import traceback +import sys + +from ..helper import parse_logs, thirdparty_binary, make_path_safe + + +class Counter(object): + def __init__(self, initval=0): + self.val = mp.Value('i', initval) + self.lock = mp.Lock() + + def increment(self): + with self.lock: + self.val.value += 1 + + def value(self): + with self.lock: + return self.val.value + + +class Stopped(object): + def __init__(self, initval=False): + self.val = mp.Value('i', initval) + self.lock = mp.Lock() + + def stop(self): + with self.lock: + self.val.value = True + + def stop_check(self): + with self.lock: + return self.val.value + + +class ProcessWorker(mp.Process): + def __init__(self, job_q, function, return_dict, stopped): + mp.Process.__init__(self) + self.function = function + self.job_q = job_q + self.return_dict = return_dict + self.stopped = stopped + + def run(self): + while True: + try: + arguments = self.job_q.get(timeout=1) + except Empty as error: + break + self.job_q.task_done() + if self.stopped.stop_check(): + continue + try: + _ = self.function(*arguments) + except Exception as e: + self.stopped.stop() + self.return_dict['error'] = arguments, Exception(traceback.format_exception(*sys.exc_info())) + return + + +def run_non_mp(function, argument_list, log_directory): + for args in argument_list: + function(*args) + + parse_logs(log_directory) + + +def run_mp(function, argument_list, log_directory): # pragma: no cover + stopped = Stopped() 
+ manager = mp.Manager() + job_queue = manager.Queue() + return_dict = manager.dict() + for a in argument_list: + job_queue.put(a, False) + procs = [] + for i in range(len(argument_list)): + p = ProcessWorker(job_queue, function, return_dict, stopped) + procs.append(p) + p.start() + + for p in procs: + p.join() + if 'error' in return_dict: + element, exc = return_dict['error'] + print(element) + raise exc + + parse_logs(log_directory) diff --git a/montreal_forced_aligner/multiprocessing/ivector.py b/montreal_forced_aligner/multiprocessing/ivector.py new file mode 100644 index 00000000..66477394 --- /dev/null +++ b/montreal_forced_aligner/multiprocessing/ivector.py @@ -0,0 +1,470 @@ +import subprocess +import os +from .helper import run_mp, run_non_mp, thirdparty_binary +from ..helper import load_scp + + +def gmm_gselect_func(iteration, train_directory, config, feature_string, x): + log_path = os.path.join(train_directory, 'log', 'gselect.{}.log'.format(x)) + with open(log_path, 'w', encoding='utf8') as log_file: + subsample_feats_proc = subprocess.Popen([thirdparty_binary('subsample-feats'), + '--n=' + str(config['subsample']), + feature_string, + 'ark:-'], + stdout=subprocess.PIPE, + stderr=log_file) + + gselect_proc = subprocess.Popen([thirdparty_binary('gmm-gselect'), + '--n=' + str(config['num_gselect']), + os.path.join(train_directory, '{}.dubm'.format(iteration)), + 'ark:-', + 'ark:' + os.path.join(train_directory, 'gselect.{}'.format(x))], + stdin=subsample_feats_proc.stdout, + stderr=log_file) + gselect_proc.communicate() + + +def gmm_gselect(iteration, config, num_jobs): + """ + Multiprocessing function that stores Gaussian selection indices on disk + + See: + + - http://kaldi-asr.org/doc/gmm-gselect_8cc.html + + for more details + on the Kaldi binary this runs. + + Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_diag_ubm.sh + for the original bash script that this function was based on. 
+ + Parameters + ---------- + config : :class:`~aligner.config.DiagUbmConfig` + Configuration object for training + num_jobs : int + The number of processes to use in calculation + + """ + directory = config.train_directory + jobs = [(iteration, directory, config.ivector_options, + config.feature_config.construct_feature_proc_string(config.data_directory, directory, x), + x) for x in range(num_jobs)] + if config.use_mp: + run_mp(gmm_gselect_func, jobs, config.log_directory) + else: + run_non_mp(gmm_gselect_func, jobs, config.log_directory) + + +def acc_global_stats_func(train_directory, config, feature_string, x, iteration): + log_path = os.path.join(train_directory, 'log', 'acc.{}.{}.log'.format(iteration, x)) + with open(log_path, 'w', encoding='utf8') as log_file: + subsample_feats_proc = subprocess.Popen([thirdparty_binary('subsample-feats'), + '--n=' + str(config['subsample']), + feature_string, + 'ark:-'], + stdout=subprocess.PIPE, + stderr=log_file) + bin_name = 'gmm-global-acc-stats' + mdl_path = os.path.join(train_directory, '{}.dubm'.format(iteration)) + gmm_global_acc_proc = subprocess.Popen([thirdparty_binary(bin_name), + '--gselect=' + 'ark:' + os.path.join(train_directory, + 'gselect.{}'.format(x)), + mdl_path, + 'ark:-', + os.path.join(train_directory, '{}.{}.acc'.format(iteration, x))], + stderr=log_file, + stdin=subsample_feats_proc.stdout) + gmm_global_acc_proc.communicate() + + +def acc_global_stats(config, num_jobs, iteration): + """ + Multiprocessing function that accumulates global GMM stats + + See: + + - http://kaldi-asr.org/doc/gmm-global-acc-stats_8cc.html + + for more details + on the Kaldi binary this runs. + + Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_diag_ubm.sh + for the original bash script that this function was based on. 
+ + Parameters + ---------- + config : :class:`~aligner.config.DiagUbmConfig` + Configuration object for training + num_jobs : int + The number of processes to use in calculation + iteration : int + Iteration to calculate stats for + """ + directory = config.train_directory + jobs = [(directory, config.ivector_options, + config.feature_config.construct_feature_proc_string(config.data_directory, directory, x), + x, iteration) for x in range(num_jobs)] + if config.use_mp: + run_mp(acc_global_stats_func, jobs, config.log_directory) + else: + run_non_mp(acc_global_stats_func, jobs, config.log_directory) + + +def gauss_to_post_func(train_directory, config, feature_string, x): + modified_posterior_scale = config['posterior_scale'] * config['subsample'] + log_path = os.path.join(train_directory, 'log', 'post.{}.log'.format(x)) + with open(log_path, 'w', encoding='utf8') as log_file: + subsample_feats_proc = subprocess.Popen([thirdparty_binary('subsample-feats'), + '--n=' + str(config['subsample']), + feature_string, + 'ark:-'], + stdout=subprocess.PIPE, + stderr=log_file) + gmm_global_get_post_proc = subprocess.Popen([thirdparty_binary('gmm-global-get-post'), + '--n=' + str(config['num_gselect']), + '--min-post=' + str(config['min_post']), + os.path.join(train_directory, 'final.dubm'), + 'ark:-', + 'ark:-'], + stdout=subprocess.PIPE, + stdin=subsample_feats_proc.stdout, + stderr=log_file) + scale_post_proc = subprocess.Popen([thirdparty_binary('scale-post'), + 'ark:-', + str(modified_posterior_scale), + 'ark:' + os.path.join(train_directory, 'post.{}'.format(x))], + stdin=gmm_global_get_post_proc.stdout, + stderr=log_file) + scale_post_proc.communicate() + + +def gauss_to_post(config, num_jobs): + """ + Multiprocessing function that does Gaussian selection and posterior extraction + + See: + + - http://kaldi-asr.org/doc/gmm-global-get-post_8cc.html + - http://kaldi-asr.org/doc/scale-post_8cc.html + + for more details + on the Kaldi binary this runs. 
+ + Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/online/nnet2/train_ivector_extractor.sh + for the original bash script that this function was based on. + + Parameters + ---------- + config : :class:`~aligner.config.iVectorExtractorConfig` + Configuration object for training + num_jobs : int + The number of processes to use in calculation + """ + func = gauss_to_post_func + directory = config.train_directory + jobs = [(config.train_directory, config.ivector_options, + config.feature_config.construct_feature_proc_string(config.data_directory, directory, x), + x) for x in range(num_jobs)] + if config.use_mp: + run_mp(func, jobs, config.log_directory) + else: + run_non_mp(func, jobs, config.log_directory) + + +def acc_ivector_stats_func(train_directory, config, feature_string, x, iteration): + log_path = os.path.join(train_directory, 'log', 'acc.{}.{}.log'.format(iteration, x)) + with open(log_path, 'w', encoding='utf8') as log_file: + subsample_feats_proc = subprocess.Popen([thirdparty_binary('subsample-feats'), + '--n=' + str(config['subsample']), + feature_string, + 'ark:-'], + stdout=subprocess.PIPE, + stderr=log_file) + acc_stats_proc = subprocess.Popen([thirdparty_binary('ivector-extractor-acc-stats'), + '--num-threads=1', + os.path.join(train_directory, '{}.ie'.format(iteration)), + 'ark:-', + 'ark:' + os.path.join(train_directory, 'post.{}'.format(x)), + os.path.join(train_directory, 'accinit.{}.{}'.format(iteration, x))], + stdin=subsample_feats_proc.stdout, + stderr=log_file) + acc_stats_proc.communicate() + + +def acc_ivector_stats(config, num_jobs, iteration): + """ + Multiprocessing function that calculates i-vector extractor stats + + See: + + - http://kaldi-asr.org/doc/ivector-extractor-acc-stats_8cc.html + - http://kaldi-asr.org/doc/ivector-extractor-sum-accs_8cc.html + + for more details + on the Kaldi binary this runs. 
+ + Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/online/nnet2/train_ivector_extractor.sh + for the original bash script that this function was based on. + + Parameters + ---------- + config : :class:`~aligner.config.iVectorExtractorConfig` + Configuration object for training + num_jobs : int + The number of processes to use in calculation + iteration : int + Iteration to calculate stats for + """ + directory = config.train_directory + jobs = [(config.train_directory, config.ivector_options, + config.feature_config.construct_feature_proc_string(config.data_directory, directory, x), + x, iteration) for x in range(num_jobs)] + if config.use_mp: + run_mp(acc_ivector_stats_func, jobs, config.log_directory) + else: + run_non_mp(acc_ivector_stats_func, jobs, config.log_directory) + + accinits = [os.path.join(config.train_directory, 'accinit.{}.{}'.format(iteration, j)) for j in range(num_jobs)] + log_path = os.path.join(config.train_directory, 'log', 'sum_acc.{}.log'.format(iteration)) + with open(log_path, 'w', encoding='utf8') as log_file: + sum_accs_proc = subprocess.Popen([thirdparty_binary('ivector-extractor-sum-accs'), + '--parallel=true'] + + accinits + + [os.path.join(config.train_directory, 'acc.{}'.format(iteration))], + stderr=log_file) + + sum_accs_proc.communicate() + # clean up + for p in accinits: + os.remove(p) + + +def extract_ivectors_func(directory, split_directory, config, feature_string, sil_phones, job_id, align_directory=None): + """ + Parameters + ---------- + config : :class:`~aligner.trainers.IvectorExtractorTrainer` + Configuration object for training + job_id : int + Job identifier + """ + use_align = False + ali_path = None + if align_directory is not None: + ali_path = os.path.join(align_directory, 'ali.{}'.format(job_id)) + use_align = os.path.exists(ali_path) + + log_dir = os.path.join(directory, 'log') + os.makedirs(log_dir, exist_ok=True) + ivector_mdl = os.path.join(directory, 'final.ie') + log_path = 
os.path.join(directory, 'log', 'extract_ivectors.{}.log'.format(job_id)) + ivectors_path = os.path.join(directory, 'ivectors.{}'.format(job_id)) + weight_path = os.path.join(directory, 'weight.{}'.format(job_id)) + mdl_path = os.path.join(directory, 'final.mdl') + spk2utt_path = os.path.join(split_directory, 'spk2utt.{}'.format(job_id)) + + silence_weight = 0.0 + posterior_scale = 0.1 + max_count = 100 + with open(log_path, 'w', encoding='utf8') as log_file: + if use_align: + ali_to_post_proc = subprocess.Popen([thirdparty_binary('ali-to-post'), + 'ark:' + ali_path, 'ark:-'], + stderr=log_file, + stdout=subprocess.PIPE) + weight_silence_proc = subprocess.Popen([thirdparty_binary('weight-silence-post'), + str(silence_weight), + sil_phones, + mdl_path, + 'ark:-', 'ark:-'], + stderr=log_file, + stdin=ali_to_post_proc.stdout, + stdout=subprocess.PIPE) + post_to_weight_proc = subprocess.Popen([thirdparty_binary('post-to-weights'), + 'ark:-', 'ark:' + weight_path], + stderr=log_file, + stdin=weight_silence_proc.stdout) + post_to_weight_proc.communicate() + + gmm_global_get_post_proc = subprocess.Popen([thirdparty_binary('gmm-global-get-post'), + '--n=' + str(config['num_gselect']), + '--min-post=' + str(config['min_post']), + os.path.join(directory, 'final.dubm'), + feature_string, + 'ark:-'], + stdout=subprocess.PIPE, + stderr=log_file) + if use_align: + weight_proc = subprocess.Popen([thirdparty_binary('weight-post'), + 'ark:-', 'ark,s,cs:' + weight_path, 'ark:-'], + stdin=gmm_global_get_post_proc.stdout, + stdout=subprocess.PIPE, stderr=log_file) + extract_in = weight_proc.stdout + else: + extract_in = gmm_global_get_post_proc.stdout + extract_proc = subprocess.Popen([thirdparty_binary('ivector-extract'), + '--acoustic-weight={}'.format(posterior_scale), + '--compute-objf-change=true', + '--max-count={}'.format(max_count), + ivector_mdl, + feature_string, + 'ark,s,cs:-', + 'ark,t:' + ivectors_path], + stderr=log_file, + stdin=extract_in) + extract_proc.communicate() + 
+ +def extract_ivectors(directory, split_directory, config, num_jobs, align_directory=None): + """ + Multiprocessing function that extracts i-vectors. + + See: + + - http://kaldi-asr.org/doc/ivector-extract-online2_8cc.html + - http://kaldi-asr.org/doc/copy-feats_8cc.html + + for more details + on the Kaldi binary this runs. + + Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh + for the original bash script that this function was based on. + + Parameters + ---------- + config : :class:`~montreal_forced_aligner.config.iVectorExtractorConfig` + Configuration object for training + num_jobs : int + The number of processes to use in calculation + """ + + log_dir = os.path.join(directory, 'log') + os.makedirs(log_dir, exist_ok=True) + data_directory = split_directory + func = extract_ivectors_func + try: + csl = config.dictionary.silence_csl + except AttributeError: + csl = None + jobs = [(directory, config.corpus.split_directory(), config.ivector_options, + config.feature_config.construct_feature_proc_string(data_directory, directory, x), + csl, + x, align_directory) for x in range(num_jobs)] + if config.use_mp: + run_mp(func, jobs, log_dir) + else: + run_non_mp(func, jobs, log_dir) + + +def get_initial_segmentation(frames, frame_shift): + segs = [] + cur_seg = None + silent_frames = 0 + non_silent_frames = 0 + for i, f in enumerate(frames): + if int(f) > 0: + non_silent_frames += 1 + if cur_seg is None: + cur_seg = {'begin': i * frame_shift} + else: + silent_frames += 1 + if cur_seg is not None: + cur_seg['end'] = (i - 1) * frame_shift + segs.append(cur_seg) + cur_seg = None + if cur_seg is not None: + cur_seg['end'] = len(frames) * frame_shift + segs.append(cur_seg) + total = non_silent_frames + silent_frames + return segs + + +def merge_segments(segments, min_pause_duration, max_segment_length): + merged_segs = [] + for s in segments: + if not merged_segs or s['begin'] > merged_segs[-1]['end'] + 
min_pause_duration or \ + s['end'] - merged_segs[-1]['begin'] > max_segment_length: + if s['end'] - s['begin'] > min_pause_duration: + merged_segs.append(s) + else: + merged_segs[-1]['end'] = s['end'] + return merged_segs + + +def segment_vad_func(directory, job_name, config): + vad_path = os.path.join(directory, 'vad.{}.scp'.format(job_name)) + vad_segments_path = os.path.join(directory, 'vad_segments.{}.scp'.format(job_name)) + + vad = load_scp(vad_path, data_type=int) + with open(vad_segments_path, 'w', encoding='utf8') as out_file: + for recording, frames in vad.items(): + initial_segments = get_initial_segmentation(frames, config['frame_shift']) + merged = merge_segments(initial_segments, config['min_pause_duration'], config['max_segment_length']) + for seg in merged: + start = seg['begin'] + end = seg['end'] + new_utt = "{utt_id}-{s:08d}-{e:08d}".format( + utt_id=recording, s=int(round(100 * start)), + e=int(round(100 * end))) + out_file.write("{utt_id} {recording} {s:.3f} {e:.3f}\n".format(utt_id=new_utt, recording=recording, + s=start, e=end)) + + +def segment_vad(corpus, config): + split_dir = corpus.split_directory() + log_directory = os.path.join(split_dir, 'log') + num_jobs = corpus.num_jobs + jobs = [(split_dir, x, config.segmentation_options) for x in range(num_jobs)] + if config.use_mp: + run_mp(segment_vad_func, jobs, log_directory) + else: + run_non_mp(segment_vad_func, jobs, log_directory) + + +def classify_speakers_func(directory, job_name): + from ..helper import load_scp, save_scp + from joblib import load + import numpy as np + import warnings + from collections import defaultdict + mdl_path = os.path.join(directory, 'speaker_classifier.mdl') + labels_path = os.path.join(directory, 'speaker_labels.txt') + speakers = {} + with open(labels_path, 'r', encoding='utf8') as f: + for line in f: + line = line.strip().split() + speaker, speak_ind = line + speakers[int(speak_ind)] = speaker + ivectors_path = os.path.join(directory, 
'ivectors.{}'.format(job_name)) + spk2utt_path = os.path.join(directory, 'spk2utt.{}'.format(job_name)) + utt2spk_path = os.path.join(directory, 'utt2spk.{}'.format(job_name)) + ivec = load_scp(ivectors_path) + x = [] + for utt, ivector in ivec.items(): + ivector = [float(x) for x in ivector] + x.append(ivector) + x = np.array(x) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + clf = load(mdl_path) + y = clf.predict(x) + speak_utt_mapping = defaultdict(list) + utt_speak_mapping = {} + for i, utt in enumerate(ivec.keys()): + speak_ind = y[i] + speaker = speakers[speak_ind] + speak_utt_mapping[speaker].append(utt) + utt_speak_mapping[utt] = speaker + save_scp(([k, v] for k,v in speak_utt_mapping.items()), spk2utt_path) + save_scp(([k, v] for k,v in utt_speak_mapping.items()), utt2spk_path) + + +def classify_speakers(directory, config, num_jobs): + log_directory = os.path.join(directory, 'log') + jobs = [(directory, x) for x in range(num_jobs)] + + if config.use_mp: + run_mp(classify_speakers_func, jobs, log_directory) + else: + run_non_mp(classify_speakers_func, jobs, log_directory) \ No newline at end of file diff --git a/montreal_forced_aligner/multiprocessing/pronunciations.py b/montreal_forced_aligner/multiprocessing/pronunciations.py new file mode 100644 index 00000000..1bfbc5c9 --- /dev/null +++ b/montreal_forced_aligner/multiprocessing/pronunciations.py @@ -0,0 +1,87 @@ +import subprocess +import os +import shutil +import re +import sys +import traceback +import time +from decimal import Decimal +import statistics + +from .helper import make_path_safe, run_mp, run_non_mp, thirdparty_binary, parse_logs + + + +def generate_pronunciations_func(model_directory, dictionary, corpus, job_name): + text_int_path = os.path.join(corpus.split_directory(), 'text.{}.int'.format(job_name)) + log_path = os.path.join(model_directory, 'log', 'pronunciation.{}.log'.format(job_name)) + ali_path = os.path.join(model_directory, 'ali.{}'.format(job_name)) + 
model_path = os.path.join(model_directory, 'final.mdl') + aligned_path = os.path.join(model_directory, 'aligned.{}'.format(job_name)) + nbest_path = os.path.join(model_directory, 'nbest.{}'.format(job_name)) + pron_path = os.path.join(model_directory, 'prons.{}'.format(job_name)) + with open(log_path, 'w', encoding='utf8') as log_file: + lin_proc = subprocess.Popen([thirdparty_binary('linear-to-nbest'), "ark:" + ali_path, + "ark:" + text_int_path, + '', '', 'ark,t:' + nbest_path], + stdout=subprocess.PIPE, stderr=log_file) + + lin_proc.communicate() + lin_proc = subprocess.Popen([thirdparty_binary('linear-to-nbest'), "ark:" + ali_path, + "ark:" + text_int_path, + '', '', 'ark:-'], + stdout=subprocess.PIPE, stderr=log_file) + align_proc = subprocess.Popen([thirdparty_binary('lattice-align-words'), + os.path.join(dictionary.phones_dir, 'word_boundary.int'), model_path, + 'ark:-', 'ark,t:' + aligned_path], + stdin=lin_proc.stdout, stderr=log_file) + align_proc.communicate() + + subprocess.call([thirdparty_binary('nbest-to-prons'), + model_path, + 'ark:' + aligned_path, + pron_path], + stderr=log_file) + + +def generate_pronunciations(align_config, model_directory, dictionary, corpus, num_jobs): + from collections import Counter, defaultdict + log_directory = os.path.join(model_directory, 'log') + os.makedirs(log_directory, exist_ok=True) + jobs = [(model_directory, dictionary, corpus, x) + for x in range(num_jobs)] + if align_config.use_mp: + run_mp(generate_pronunciations_func, jobs, log_directory) + else: + run_non_mp(generate_pronunciations_func, jobs, log_directory) + + word_lookup = dictionary.reversed_word_mapping + phone_lookup = dictionary.reversed_phone_mapping + pron_counts = defaultdict(Counter) + for j in range(num_jobs): + pron_path = os.path.join(model_directory, 'prons.{}'.format(j)) + with open(pron_path, 'r', encoding='utf8') as f: + utt_mapping = {} + last_utt = None + for line in f: + line = line.split() + utt = line[0] + if utt not in utt_mapping: 
+ if last_utt is not None: + utt_mapping[last_utt].append('') + utt_mapping[utt] = [''] + last_utt = utt + + begin = line[1] + end = line[2] + word = word_lookup[int(line[3])] + if word == '': + utt_mapping[utt].append(word) + else: + pron = tuple(phone_lookup[int(x)].split('_')[0] for x in line[4:]) + pron_string = ' '.join(pron) + utt_mapping[utt].append(word + ' ' + pron_string) + pron_counts[word][pron] += 1 + print(word, pron) + return pron_counts, utt_mapping + diff --git a/montreal_forced_aligner/multiprocessing/transcription.py b/montreal_forced_aligner/multiprocessing/transcription.py new file mode 100644 index 00000000..c6d8e54c --- /dev/null +++ b/montreal_forced_aligner/multiprocessing/transcription.py @@ -0,0 +1,332 @@ +import subprocess +import os +import shutil + +from .helper import run_mp, run_non_mp, thirdparty_binary + + +def decode_func(directory, job_name, mdl, config, feat_string, output_directory, num_threads=None): + log_path = os.path.join(output_directory, 'log', 'decode.{}.log'.format(job_name)) + lat_path = os.path.join(output_directory, 'lat.{}'.format(job_name)) + if os.path.exists(lat_path): + return + word_symbol_path = os.path.join(directory, 'words.txt') + hclg_path = os.path.join(directory, 'HCLG.fst') + if config.fmllr and config.first_beam is not None: + beam = config.first_beam + else: + beam = config.beam + if config.fmllr and config.first_max_active is not None: + max_active = config.first_max_active + else: + max_active = config.max_active + with open(log_path, 'w', encoding='utf8') as log_file: + if num_threads is None: + decode_proc = subprocess.Popen([thirdparty_binary('gmm-latgen-faster'), + '--max-active={}'.format(max_active), + '--beam={}'.format(beam), + '--lattice-beam={}'.format(config.lattice_beam), + '--allow-partial=true', + '--word-symbol-table={}'.format(word_symbol_path), + '--acoustic-scale={}'.format(config.acoustic_scale), + mdl, hclg_path, feat_string, + "ark:" + lat_path], + stderr=log_file) + else: + 
decode_proc = subprocess.Popen([thirdparty_binary('gmm-latgen-faster-parallel'), + '--max-active={}'.format(max_active), + '--beam={}'.format(beam), + '--lattice-beam={}'.format(config.lattice_beam), + '--allow-partial=true', + '--word-symbol-table={}'.format(word_symbol_path), + '--acoustic-scale={}'.format(config.acoustic_scale), + '--num-threads={}'.format(num_threads), + mdl, hclg_path, feat_string, + "ark:" + lat_path], + stderr=log_file) + decode_proc.communicate() + + +def score_func(directory, job_name, config, output_directory, language_model_weight=None, word_insertion_penalty=None): + lat_path = os.path.join(directory, 'lat.{}'.format(job_name)) + words_path = os.path.join(directory, 'words.txt') + tra_path = os.path.join(output_directory, 'tra.{}'.format(job_name)) + log_path = os.path.join(output_directory, 'log', 'score.{}.log'.format(job_name)) + if language_model_weight is None: + language_model_weight = config.language_model_weight + if word_insertion_penalty is None: + word_insertion_penalty = config.word_insertion_penalty + with open(log_path, 'w', encoding='utf8') as log_file: + scale_proc = subprocess.Popen([thirdparty_binary('lattice-scale'), + '--inv-acoustic-scale={}'.format(language_model_weight), + 'ark:' + lat_path, 'ark:-' + ], stdout=subprocess.PIPE, stderr=log_file) + penalty_proc = subprocess.Popen([thirdparty_binary('lattice-add-penalty'), + '--word-ins-penalty={}'.format(word_insertion_penalty), + 'ark:-', 'ark:-'], + stdin=scale_proc.stdout, stdout=subprocess.PIPE, stderr=log_file) + best_path_proc = subprocess.Popen([thirdparty_binary('lattice-best-path'), + '--word-symbol-table={}'.format(words_path), + 'ark:-', 'ark,t:' + tra_path], stdin=penalty_proc.stdout, stderr=log_file) + best_path_proc.communicate() + + +def transcribe(transcriber): + """ + """ + directory = transcriber.transcribe_directory + output_directory = transcriber.transcribe_directory + log_directory = os.path.join(output_directory, 'log') + config = 
transcriber.transcribe_config + mdl_path = os.path.join(directory, 'final.mdl') + corpus = transcriber.corpus + num_jobs = corpus.num_jobs + + if config.use_mp and num_jobs > 1: + jobs = [(directory, x, mdl_path, config, + config.feature_config.construct_feature_proc_string(corpus.split_directory(), directory, x), + output_directory) + for x in range(num_jobs)] + else: + jobs = [(directory, x, mdl_path, config, + config.feature_config.construct_feature_proc_string(corpus.split_directory(), directory, x), + output_directory, corpus.original_num_jobs) + for x in range(num_jobs)] + + if config.use_mp and num_jobs > 1: + run_mp(decode_func, jobs, log_directory) + else: + run_non_mp(decode_func, jobs, log_directory) + + if transcriber.evaluation_mode: + best_wer = 10000 + best = None + for lmwt in range(transcriber.min_language_model_weight, transcriber.max_language_model_weight): + for wip in transcriber.word_insertion_penalties: + out_dir = os.path.join(output_directory, 'eval_{}_{}'.format(lmwt, wip)) + log_dir = os.path.join(out_dir, 'log') + os.makedirs(log_dir, exist_ok=True) + jobs = [(directory, x, config, out_dir, lmwt, wip) + for x in range(num_jobs)] + if config.use_mp: + run_mp(score_func, jobs, log_dir) + else: + run_non_mp(score_func, jobs, log_dir) + ser, wer = transcriber.evaluate(out_dir, out_dir) + if wer < best_wer: + best = (lmwt, wip) + transcriber.transcribe_config.language_model_weight = best[0] + transcriber.transcribe_config.word_insertion_penalty = best[1] + else: + jobs = [(directory, x, config, output_directory) + for x in range(num_jobs)] + if config.use_mp: + run_mp(score_func, jobs, log_directory) + else: + run_non_mp(score_func, jobs, log_directory) + + +def initial_fmllr_func(directory, split_directory, sil_phones, job_name, mdl, config, feat_string, output_directory, + num_threads=None): + + log_path = os.path.join(output_directory, 'log', 'initial_fmllr.{}.log'.format(job_name)) + pre_trans_path = os.path.join(output_directory, 
'pre_trans.{}'.format(job_name)) + lat_path = os.path.join(directory, 'lat.{}'.format(job_name)) + spk2utt_path = os.path.join(split_directory, 'spk2utt.{}'.format(job_name)) + + with open(log_path, 'w', encoding='utf8') as log_file: + latt_post_proc = subprocess.Popen([thirdparty_binary('lattice-to-post'), + '--acoustic-scale={}'.format(config.acoustic_scale), + 'ark:' + lat_path, 'ark:-'], stdout=subprocess.PIPE, + stderr=log_file) + weight_silence_proc = subprocess.Popen([thirdparty_binary('weight-silence-post'), + str(config.silence_weight), + sil_phones, mdl, 'ark:-', 'ark:-'], + stdin=latt_post_proc.stdout, stdout=subprocess.PIPE, + stderr=log_file) + gmm_gpost_proc = subprocess.Popen([thirdparty_binary('gmm-post-to-gpost'), + mdl, feat_string, 'ark:-', 'ark:-'], + stdin=weight_silence_proc.stdout, stdout=subprocess.PIPE, + stderr=log_file) + fmllr_proc = subprocess.Popen([thirdparty_binary('gmm-est-fmllr-gpost'), + '--fmllr-update-type={}'.format(config.fmllr_update_type), + '--spk2utt=ark:' + spk2utt_path, mdl, feat_string, + 'ark,s,cs:-', 'ark:' + pre_trans_path], + stdin=gmm_gpost_proc.stdout, stdout=subprocess.PIPE, stderr=log_file) + fmllr_proc.communicate() + + +def lat_gen_fmllr_func(directory, split_directory, sil_phones, job_name, mdl, config, feat_string, output_directory, + num_threads=None): + log_path = os.path.join(output_directory, 'log', 'lat_gen.{}.log'.format(job_name)) + word_symbol_path = os.path.join(directory, 'words.txt') + hclg_path = os.path.join(directory, 'HCLG.fst') + tmp_lat_path = os.path.join(output_directory, 'lat.tmp.{}'.format(job_name)) + with open(log_path, 'w', encoding='utf8') as log_file: + if num_threads is None: + lat_gen_proc = subprocess.Popen([thirdparty_binary('gmm-latgen-faster'), + '--max-active={}'.format(config.max_active), + '--beam={}'.format(config.beam), + '--lattice-beam={}'.format(config.lattice_beam), + '--acoustic-scale={}'.format(config.acoustic_scale), + '--determinize-lattice=false', + 
'--allow-partial=true', + '--word-symbol-table={}'.format(word_symbol_path), + mdl, hclg_path, feat_string, 'ark:' + tmp_lat_path + ], stderr=log_file) + else: + lat_gen_proc = subprocess.Popen([thirdparty_binary('gmm-latgen-faster-parallel'), + '--max-active={}'.format(config.max_active), + '--beam={}'.format(config.beam), + '--lattice-beam={}'.format(config.lattice_beam), + '--acoustic-scale={}'.format(config.acoustic_scale), + '--determinize-lattice=false', + '--allow-partial=true', + '--num-threads={}'.format(num_threads), + '--word-symbol-table={}'.format(word_symbol_path), + mdl, hclg_path, feat_string, 'ark:' + tmp_lat_path + ], stderr=log_file) + lat_gen_proc.communicate() + + +def final_fmllr_est_func(directory, split_directory, sil_phones, job_name, mdl, config, feat_string, output_directory, + num_threads=None): + log_path = os.path.join(output_directory, 'log', 'final_fmllr.{}.log'.format(job_name)) + pre_trans_path = os.path.join(output_directory, 'pre_trans.{}'.format(job_name)) + trans_tmp_path = os.path.join(output_directory, 'trans_tmp.{}'.format(job_name)) + trans_path = os.path.join(output_directory, 'trans.{}'.format(job_name)) + lat_path = os.path.join(directory, 'lat.{}'.format(job_name)) + spk2utt_path = os.path.join(split_directory, 'spk2utt.{}'.format(job_name)) + tmp_lat_path = os.path.join(output_directory, 'lat.tmp.{}'.format(job_name)) + with open(log_path, 'w', encoding='utf8') as log_file: + if num_threads is None: + determinize_proc = subprocess.Popen([thirdparty_binary('lattice-determinize-pruned'), + '--acoustic-scale={}'.format(config.acoustic_scale), + '--beam=4.0', 'ark:' + tmp_lat_path, 'ark:-'], + stderr=log_file, stdout=subprocess.PIPE) + else: + determinize_proc = subprocess.Popen([thirdparty_binary('lattice-determinize-pruned-parallel'), + '--acoustic-scale={}'.format(config.acoustic_scale), + '--num-threads={}'.format(num_threads), + '--beam=4.0', 'ark:' + tmp_lat_path, 'ark:-'], + stderr=log_file, stdout=subprocess.PIPE) 
+ latt_post_proc = subprocess.Popen([thirdparty_binary('lattice-to-post'), + '--acoustic-scale={}'.format(config.acoustic_scale), + 'ark:' + lat_path, 'ark:-'], + stdin=determinize_proc.stdout, stdout=subprocess.PIPE, stderr=log_file) + weight_silence_proc = subprocess.Popen([thirdparty_binary('weight-silence-post'), + str(config.silence_weight), + sil_phones, mdl, 'ark:-', 'ark:-'], + stdin=latt_post_proc.stdout, stdout=subprocess.PIPE, + stderr=log_file) + fmllr_proc = subprocess.Popen([thirdparty_binary('gmm-est-fmllr'), + '--fmllr-update-type={}'.format(config.fmllr_update_type), + '--spk2utt=ark:' + spk2utt_path, mdl, feat_string, + 'ark,s,cs:-', 'ark:' + trans_tmp_path], + stdin=weight_silence_proc.stdout, stdout=subprocess.PIPE, stderr=log_file) + fmllr_proc.communicate() + + compose_proc = subprocess.Popen([thirdparty_binary('compose-transforms'), + '--b-is-affine=true', 'ark:' + trans_tmp_path, + 'ark:' + pre_trans_path, 'ark:' + trans_path], + stderr=log_file) + compose_proc.communicate() + + +def fmllr_rescore_func(directory, split_directory, sil_phones, job_name, mdl, config, feat_string, output_directory, + num_threads=None): + log_path = os.path.join(output_directory, 'log', 'fmllr_rescore.{}.log'.format(job_name)) + tmp_lat_path = os.path.join(output_directory, 'lat.tmp.{}'.format(job_name)) + final_lat_path = os.path.join(output_directory, 'lat.{}'.format(job_name)) + with open(log_path, 'w', encoding='utf8') as log_file: + rescore_proc = subprocess.Popen([thirdparty_binary('gmm-rescore-lattice'), + mdl, 'ark:' + tmp_lat_path, + feat_string, 'ark:-'], + stdout=subprocess.PIPE, stderr=log_file) + if num_threads is None: + determinize_proc = subprocess.Popen([thirdparty_binary('lattice-determinize-pruned'), + '--acoustic-scale={}'.format(config.acoustic_scale), + '--beam={}'.format(config.lattice_beam), + 'ark:-', 'ark:' + final_lat_path + ], stdin=rescore_proc.stdout, stderr=log_file) + else: + determinize_proc = 
def transcribe_fmllr(transcriber):
    """
    Speaker-adapted (fMLLR) second-pass transcription.

    Runs the four fMLLR stages (initial transform estimation, lattice
    regeneration, final transform estimation, lattice rescoring) and then
    scores the rescored lattices.  In evaluation mode, a grid search over
    language model weights and word insertion penalties picks the best
    combination, stores it on the config and copies the winning transcripts
    back to the fMLLR directory.

    Parameters
    ----------
    transcriber : :class:`~montreal_forced_aligner.transcriber.Transcriber`
        Transcriber supplying the corpus, configuration and directories
    """
    directory = transcriber.transcribe_directory
    output_directory = transcriber.transcribe_directory
    config = transcriber.transcribe_config
    corpus = transcriber.corpus
    num_jobs = corpus.num_jobs
    split_directory = corpus.split_directory()
    sil_phones = transcriber.dictionary.optional_silence_csl

    fmllr_directory = os.path.join(output_directory, 'fmllr')
    log_dir = os.path.join(fmllr_directory, 'log')
    os.makedirs(log_dir, exist_ok=True)
    mdl_path = os.path.join(directory, 'final.mdl')
    # BUG FIX: the jobs list was previously rebuilt identically inside a
    # redundant "for x in range(num_jobs)" wrapper; build it once.  An unused
    # feat_name computation was also removed.
    if num_jobs > 1:
        jobs = [(directory, split_directory, sil_phones, x, mdl_path, config,
                 config.feature_config.construct_feature_proc_string(split_directory, directory, x), fmllr_directory)
                for x in range(num_jobs)]
    else:
        jobs = [(directory, split_directory, sil_phones, x, mdl_path, config,
                 config.feature_config.construct_feature_proc_string(split_directory, directory, x), fmllr_directory,
                 corpus.original_num_jobs)
                for x in range(num_jobs)]

    # Transform estimation stages are not parallel-safe, so always run serially
    run_non_mp(initial_fmllr_func, jobs, log_dir)

    if config.use_mp and num_jobs > 1:
        run_mp(lat_gen_fmllr_func, jobs, log_dir)
    else:
        run_non_mp(lat_gen_fmllr_func, jobs, log_dir)

    run_non_mp(final_fmllr_est_func, jobs, log_dir)

    if config.use_mp:
        run_mp(fmllr_rescore_func, jobs, log_dir)
    else:
        run_non_mp(fmllr_rescore_func, jobs, log_dir)

    if transcriber.evaluation_mode:
        best_wer = 10000
        best = None
        for lmwt in range(transcriber.min_language_model_weight, transcriber.max_language_model_weight):
            for wip in transcriber.word_insertion_penalties:
                out_dir = os.path.join(fmllr_directory, 'eval_{}_{}'.format(lmwt, wip))
                log_dir = os.path.join(out_dir, 'log')
                os.makedirs(log_dir, exist_ok=True)
                jobs = [(directory, x, config, out_dir, lmwt, wip)
                        for x in range(num_jobs)]
                if config.use_mp:
                    run_mp(score_func, jobs, log_dir)
                else:
                    run_non_mp(score_func, jobs, log_dir)
                ser, wer = transcriber.evaluate(out_dir, out_dir)
                if wer < best_wer:
                    # BUG FIX: best_wer was never updated, so "best" ended up
                    # being the last combination tried rather than the best one
                    best_wer = wer
                    best = (lmwt, wip)
        transcriber.transcribe_config.language_model_weight = best[0]
        transcriber.transcribe_config.word_insertion_penalty = best[1]
        out_dir = os.path.join(fmllr_directory, 'eval_{}_{}'.format(best[0], best[1]))
        for j in range(num_jobs):
            tra_path = os.path.join(out_dir, 'tra.{}'.format(j))
            saved_tra_path = os.path.join(fmllr_directory, 'tra.{}'.format(j))
            shutil.copyfile(tra_path, saved_tra_path)
    else:
        jobs = [(directory, x, config, fmllr_directory)
                for x in range(num_jobs)]
        if config.use_mp:
            run_mp(score_func, jobs, log_dir)
        else:
            run_non_mp(score_func, jobs, log_dir)
class Segmenter(object):
    """
    Class for performing speech segmentation based on voice activity detection

    Parameters
    ----------
    corpus : :class:`~montreal_forced_aligner.corpus.TranscribeCorpus`
        Corpus object for the dataset
    segmentation_config : :class:`~montreal_forced_aligner.config.SegmentationConfig`
        Configuration for segmentation
    temp_directory : str, optional
        Specifies the temporary directory root to save files needed for Kaldi.
        If not specified, it will be set to ``~/Documents/MFA``
    call_back : callable, optional
        Specifies a call back function for segmentation
    debug : bool
        Flag for running in debug mode, defaults to false
    verbose : bool
        Flag for running in verbose mode, defaults to false
    """
    def __init__(self, corpus, segmentation_config,
                 temp_directory=None, call_back=None, debug=False, verbose=False, logger=None):
        self.corpus = corpus
        self.segmentation_config = segmentation_config

        if not temp_directory:
            temp_directory = TEMP_DIR
        self.temp_directory = temp_directory
        self.call_back = call_back
        if self.call_back is None:
            self.call_back = print
        self.debug = debug
        self.verbose = verbose
        self.logger = logger
        self.setup()

    @property
    def segmenter_directory(self):
        # Root working directory for all segmentation artifacts
        return os.path.join(self.temp_directory, 'segmentation')

    @property
    def vad_options(self):
        # Options forwarded to Kaldi's VAD computation
        return {'energy_threshold': self.segmentation_config.energy_threshold,
                'energy_mean_scale': self.segmentation_config.energy_mean_scale}

    @property
    def segmentation_options(self):
        # frame_shift is stored in milliseconds; downstream expects seconds
        return {'max_segment_length': self.segmentation_config.max_segment_length,
                'min_pause_duration': self.segmentation_config.min_pause_duration,
                'frame_shift': round(self.segmentation_config.feature_config.frame_shift / 1000, 2)}

    @property
    def use_mp(self):
        return self.segmentation_config.use_mp

    def setup(self):
        """Initialize the corpus and generate features, restarting from scratch if a previous run errored."""
        done_path = os.path.join(self.segmenter_directory, 'done')
        if os.path.exists(done_path):
            # FIX: log messages previously said "Classification", copy-pasted
            # from the speaker classifier
            self.logger.info('Segmentation already done, skipping initialization.')
            return
        dirty_path = os.path.join(self.segmenter_directory, 'dirty')
        if os.path.exists(dirty_path):  # previous run errored; redo from scratch
            shutil.rmtree(self.segmenter_directory)
        log_dir = os.path.join(self.segmenter_directory, 'log')
        os.makedirs(log_dir, exist_ok=True)
        try:
            self.corpus.initialize_corpus()
            fc = self.segmentation_config.feature_config
            fc.generate_features(self.corpus, logger=self.logger, cmvn=False)
        except Exception as e:
            # Mark the directory dirty so the next run starts clean
            with open(dirty_path, 'w'):
                pass
            if isinstance(e, KaldiProcessingError):
                log_kaldi_errors(e.error_logs, self.logger)
            raise

    def segment(self):
        """Run voice activity detection and create VAD-based segments."""
        log_directory = os.path.join(self.segmenter_directory, 'log')
        dirty_path = os.path.join(self.segmenter_directory, 'dirty')
        done_path = os.path.join(self.segmenter_directory, 'done')
        if os.path.exists(done_path):
            self.logger.info('Segmentation already done, skipping.')
            return
        try:
            fc = self.segmentation_config.feature_config
            fc.compute_vad(self.corpus, logger=self.logger, vad_config=self.vad_options)
            self.corpus.create_vad_segments(self)
            parse_logs(log_directory)
        except Exception as e:
            with open(dirty_path, 'w'):
                pass
            if isinstance(e, KaldiProcessingError):
                log_kaldi_errors(e.error_logs, self.logger)
            raise
        with open(done_path, 'w'):
            pass

    def export_segments(self, output_directory):
        """Write one TextGrid per input file with a single 'segments' tier of detected speech intervals."""
        from decimal import Decimal
        from textgrid import TextGrid, IntervalTier

        file_dict = {}
        for utt, segment in self.corpus.vad_segments.items():
            filename, utt_begin, utt_end = segment
            utt_begin = Decimal(utt_begin)
            utt_end = Decimal(utt_end)
            if filename not in file_dict:
                file_dict[filename] = {}
            speaker = 'segments'
            text = 'speech'
            if speaker not in file_dict[filename]:
                file_dict[filename][speaker] = []
            file_dict[filename][speaker].append([utt_begin, utt_end, text])
        for filename, speaker_dict in file_dict.items():
            try:
                speaker_directory = os.path.join(output_directory, self.corpus.file_directory_mapping[filename])
            except KeyError:
                speaker_directory = output_directory
            os.makedirs(speaker_directory, exist_ok=True)
            max_time = self.corpus.get_wav_duration(filename)
            tg = TextGrid(maxTime=max_time)
            for speaker in sorted(speaker_dict.keys()):
                words = speaker_dict[speaker]
                tier = IntervalTier(name=speaker, maxTime=max_time)
                for w in words:
                    # Clamp interval ends to the file duration
                    if w[1] > max_time:
                        w[1] = max_time
                    tier.add(*w)
                tg.append(tier)
            tg.write(os.path.join(speaker_directory, filename + '.TextGrid'))
class SpeakerClassifier(object):
    """
    Class for performing speaker classification

    Parameters
    ----------
    corpus : :class:`~montreal_forced_aligner.corpus.TranscribeCorpus`
        Corpus object for the dataset
    ivector_extractor : :class:`~montreal_forced_aligner.models.IvectorExtractor`
        Trained ivector extractor used to compute utterance ivectors
    classification_config : :class:`~montreal_forced_aligner.config.SpeakerClassificationConfig`
        Configuration for speaker classification
    temp_directory : str, optional
        Specifies the temporary directory root to save files needed for Kaldi.
        If not specified, it will be set to ``~/Documents/MFA``
    call_back : callable, optional
        Specifies a call back function for classification
    debug : bool
        Flag for running in debug mode, defaults to false
    verbose : bool
        Flag for running in verbose mode, defaults to false
    """
    def __init__(self, corpus, ivector_extractor, classification_config, compute_segments=False,
                 num_speakers=None, cluster=False,
                 temp_directory=None, call_back=None, debug=False, verbose=False, logger=None):
        self.corpus = corpus
        self.ivector_extractor = ivector_extractor
        self.feature_config = self.ivector_extractor.feature_config
        self.classification_config = classification_config

        if not temp_directory:
            temp_directory = TEMP_DIR
        self.temp_directory = temp_directory
        self.call_back = call_back
        if self.call_back is None:
            self.call_back = print
        self.debug = debug
        self.compute_segments = compute_segments
        self.verbose = verbose
        self.logger = logger
        self.classifier = None
        self.speaker_labels = {}
        self.ivectors = {}
        self.num_speakers = num_speakers
        self.cluster = cluster
        self.setup()

    @property
    def classify_directory(self):
        # Root working directory for all classification artifacts
        return os.path.join(self.temp_directory, 'speaker_classification')

    @property
    def ivector_options(self):
        return self.ivector_extractor.meta

    @property
    def use_mp(self):
        return self.classification_config.use_mp

    def setup(self):
        """Initialize the corpus, generate features and extract utterance ivectors."""
        done_path = os.path.join(self.classify_directory, 'done')
        if os.path.exists(done_path):
            self.logger.info('Classification already done, skipping initialization.')
            return
        dirty_path = os.path.join(self.classify_directory, 'dirty')
        if os.path.exists(dirty_path):  # previous run errored; redo from scratch
            shutil.rmtree(self.classify_directory)
        log_dir = os.path.join(self.classify_directory, 'log')
        os.makedirs(log_dir, exist_ok=True)
        self.ivector_extractor.export_model(self.classify_directory)
        try:
            self.corpus.initialize_corpus()
            self.feature_config.generate_features(self.corpus, logger=self.logger, cmvn=False)
            extract_ivectors(self.classify_directory, self.corpus.split_directory(), self, self.corpus.num_jobs)
        except Exception as e:
            with open(dirty_path, 'w'):
                pass
            if isinstance(e, KaldiProcessingError):
                log_kaldi_errors(e.error_logs, self.logger)
            raise

    def classify(self):
        """Run the first-pass speaker classification (no-op when clustering instead)."""
        log_directory = os.path.join(self.classify_directory, 'log')
        dirty_path = os.path.join(self.classify_directory, 'dirty')
        done_path = os.path.join(self.classify_directory, 'done')
        if os.path.exists(done_path):
            self.logger.info('Classification already done, skipping.')
            return
        try:
            if not self.cluster:
                classify_speakers(self.classify_directory, self, self.corpus.num_jobs)
            parse_logs(log_directory)
        except Exception as e:
            with open(dirty_path, 'w'):
                pass
            if isinstance(e, KaldiProcessingError):
                log_kaldi_errors(e.error_logs, self.logger)
            raise
        with open(done_path, 'w'):
            pass

    def load_ivectors(self):
        """Load all per-job ivectors into ``self.ivectors`` as lists of floats."""
        self.ivectors = {}
        for j in range(self.corpus.num_jobs):
            ivectors_path = os.path.join(self.classify_directory, 'ivectors.{}'.format(j))
            ivec = load_scp(ivectors_path)
            for utt, ivector in ivec.items():
                ivector = [float(x) for x in ivector]
                self.ivectors[utt] = ivector

    def load_classifier(self):
        """Load the trained classifier and the speaker index-to-name mapping."""
        import warnings
        mdl_path = os.path.join(self.classify_directory, 'speaker_classifier.mdl')
        with warnings.catch_warnings():
            # joblib may warn about version mismatches when unpickling
            warnings.simplefilter("ignore")
            self.classifier = load(mdl_path)

        # FIX: a second, dead loop that rebuilt the same mapping into an
        # unused local dict was removed
        labels_path = os.path.join(self.classify_directory, 'speaker_labels.txt')
        with open(labels_path, 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip().split()
                speaker, speak_ind = line
                self.speaker_labels[int(speak_ind)] = speaker

    def cluster_utterances(self):
        """Cluster utterance ivectors into ``num_speakers`` groups with KMeans and save the mappings."""
        from sklearn.cluster import KMeans
        if not self.ivectors:
            self.load_ivectors()
        x = np.array(list(self.ivectors.values()))
        clust = KMeans(self.num_speakers).fit(x)
        y = clust.labels_
        spk2utt_path = os.path.join(self.classify_directory, 'spk2utt')
        utt2spk_path = os.path.join(self.classify_directory, 'utt2spk')
        utt2spk = {}
        spk2utt = {}
        for i, u in enumerate(self.ivectors.keys()):
            speaker = y[i]
            utt2spk[u] = speaker
            if speaker not in spk2utt:
                spk2utt[speaker] = []
            # BUG FIX: previously appended the speaker label instead of the
            # utterance id, producing useless spk2utt entries
            spk2utt[speaker].append(u)
        save_scp(([k, v] for k, v in spk2utt.items()), spk2utt_path)
        save_scp(([k, v] for k, v in utt2spk.items()), utt2spk_path)

    def classify_utterances(self, utterances, valid_speakers=None):
        """
        Classify the given utterances, optionally restricted to ``valid_speakers``.

        Returns a dict mapping each utterance to its most probable speaker.
        """
        if not self.classifier:
            self.load_classifier()
        if not self.ivectors:
            self.load_ivectors()
        x = np.array([self.ivectors[u] for u in utterances])
        y = self.classifier.predict_proba(x)
        if valid_speakers:
            # Zero out probabilities of speakers outside the allowed set
            for i in range(y.shape[1]):
                if self.speaker_labels[i] not in valid_speakers:
                    y[:, i] = 0
        output = {}
        inds = y.argmax(axis=1)
        for i, u in enumerate(utterances):
            output[u] = self.speaker_labels[inds[i]]
        return output

    def get_classification_stats(self):
        """Merge per-job mappings, keep the most frequent speakers and reclassify stragglers."""
        begin = time.time()
        from collections import Counter
        counts = Counter()
        utt2spk = {}
        spk2utt = {}
        for j in range(self.corpus.num_jobs):
            utt2spk_path = os.path.join(self.classify_directory, 'utt2spk.{}'.format(j))
            utt2spk.update(load_scp(utt2spk_path))
        for j in range(self.corpus.num_jobs):
            spk2utt_path = os.path.join(self.classify_directory, 'spk2utt.{}'.format(j))
            spk2utt.update(load_scp(spk2utt_path))
        spk2utt_path = os.path.join(self.classify_directory, 'spk2utt')
        utt2spk_path = os.path.join(self.classify_directory, 'utt2spk')
        for speak, utts in spk2utt.items():
            # load_scp yields a bare string for single-utterance speakers
            if not isinstance(utts, list):
                spk2utt[speak] = [utts]
            counts[speak] = len(spk2utt[speak])

        if self.num_speakers:
            # BUG FIX: take the num_speakers MOST frequent speakers; the
            # previous ascending sort selected the least frequent ones,
            # contradicting the documented "top X speakers" behavior
            valid_speakers = [s for s, _ in counts.most_common(self.num_speakers)]
        else:
            valid_speakers = [x for x in counts.keys() if counts[x] > 1]
        if not valid_speakers:  # Only single utterance count speakers
            valid_speakers = [x for x in counts.keys()]
        reanalyze_utts = []
        for speak, c in counts.items():
            if c == 1 or speak not in valid_speakers:
                utts = spk2utt[speak]
                for u in utts:
                    reanalyze_utts.append(u)

        spk2utt = {k: v for k, v in spk2utt.items() if k in valid_speakers}
        new_utt2spk = self.classify_utterances(reanalyze_utts, valid_speakers)
        for u, spk in new_utt2spk.items():
            utt2spk[u] = spk
            spk2utt[spk].append(u)
        save_scp(([k, v] for k, v in spk2utt.items()), spk2utt_path)
        save_scp(([k, v] for k, v in utt2spk.items()), utt2spk_path)
        self.logger.debug('Analyzing stats and reclassification took {} seconds'.format(time.time() - begin))

    def export_classification(self, output_directory):
        """Export results: TextGrids per file for segmented corpora, otherwise per-speaker utterance lists."""
        if self.cluster:
            self.cluster_utterances()
        else:
            self.get_classification_stats()
        from decimal import Decimal
        from textgrid import TextGrid, IntervalTier
        spk2utt_path = os.path.join(self.classify_directory, 'spk2utt')
        utt2spk_path = os.path.join(self.classify_directory, 'utt2spk')
        if self.corpus.segments:
            utt2spk = load_scp(utt2spk_path)
            file_dict = {}
            for utt, segment in self.corpus.segments.items():
                filename, utt_begin, utt_end = segment.split(' ')
                utt_begin = Decimal(utt_begin)
                utt_end = Decimal(utt_end)
                if filename not in file_dict:
                    file_dict[filename] = {}
                speaker = utt2spk[utt]
                text = self.corpus.text_mapping[utt]
                if speaker not in file_dict[filename]:
                    file_dict[filename][speaker] = []
                file_dict[filename][speaker].append([utt_begin, utt_end, text])
            for filename, speaker_dict in file_dict.items():
                try:
                    speaker_directory = os.path.join(output_directory, self.corpus.file_directory_mapping[filename])
                except KeyError:
                    speaker_directory = output_directory
                # FIX: directory was not created before writing, unlike the
                # parallel Segmenter.export_segments implementation
                os.makedirs(speaker_directory, exist_ok=True)
                max_time = self.corpus.get_wav_duration(filename)
                tg = TextGrid(maxTime=max_time)
                for speaker in sorted(speaker_dict.keys()):
                    words = speaker_dict[speaker]
                    tier = IntervalTier(name=speaker, maxTime=max_time)
                    for w in words:
                        # Clamp interval ends to the file duration
                        if w[1] > max_time:
                            w[1] = max_time
                        tier.add(*w)
                    tg.append(tier)
                tg.write(os.path.join(speaker_directory, filename + '.TextGrid'))
        else:
            spk2utt = load_scp(spk2utt_path)
            for speaker, utts in spk2utt.items():
                speaker_dir = os.path.join(output_directory, speaker)
                os.makedirs(speaker_dir, exist_ok=True)
                with open(os.path.join(speaker_dir, 'utterances.txt'), 'w', encoding='utf8') as f:
                    for u in utts:
                        f.write('{}\n'.format(u))
'nbest-to-ctm', 'paste-feats', 'post-to-weights', 'scale-post', 'select-feats', - 'show-transitions', - 'splice-feats', 'subsample-feats', 'sum-lda-accs', 'sum-tree-stats', 'transform-feats', - 'tree-info', 'weight-silence-post'] + 'append-vector-to-feats', 'apply-cmvn', 'build-tree', 'cluster-phones', 'compile-questions', + 'compile-train-graphs', 'compile-train-graphs-fsts', 'compose-transforms', 'compute-cmvn-stats', + 'compute-mfcc-feats', 'convert-ali', 'copy-feats', 'est-lda', 'est-mllt', + 'extract-segments', 'feat-to-dim', 'feat-to-len', 'gmm-acc-mllt', 'gmm-acc-stats-ali', + 'gmm-align-compiled', + 'gmm-boost-silence', 'gmm-est', 'gmm-est-fmllr', 'gmm-info', 'gmm-init-model', 'gmm-init-mono', + 'gmm-latgen-faster', 'gmm-mixup', + 'gmm-sum-accs', 'gmm-transform-means', + 'lattice-align-words', 'lattice-oracle', 'lattice-to-phone-lattice', 'linear-to-nbest', + 'nbest-to-ctm', 'paste-feats', 'post-to-weights', 'select-feats', + 'show-transitions', + 'splice-feats', 'sum-lda-accs', 'sum-tree-stats', 'transform-feats', + 'tree-info', 'weight-silence-post', 'subset-feats'] train_dict_filenames = ['nbest-to-prons'] @@ -41,25 +39,35 @@ 'add-self-loops', 'lattice-scale', 'lattice-add-penalty', 'lattice-best-path', 'lattice-to-post', 'gmm-post-to-gpost', 'gmm-est-fmllr-gpost', 'lattice-determinize-pruned', 'gmm-rescore-lattice', 'gmm-latgen-faster-parallel', 'lattice-determinize-pruned-parallel' - ] - -included_filenames = alignment_filenames + train_dict_filenames + transcribe_filenames +] + +speaker_diarization_filenames = [ + 'compute-vad', 'apply-cmvn-sliding', + 'subsample-feats', 'scale-post', + 'ivector-extract', 'ivector-extractor-acc-stats', 'ivector-extractor-est', 'ivector-extractor-init', + 'ivector-extractor-sum-accs', 'gmm-global-acc-stats', 'gmm-global-est', + 'gmm-global-get-post', 'gmm-global-init-from-feats', 'gmm-global-sum-accs', 'gmm-global-to-fgmm', + 'gmm-gselect', 'select-voiced-frames', +] + +included_filenames = alignment_filenames + 
train_dict_filenames + transcribe_filenames + speaker_diarization_filenames if sys.platform == 'win32': - included_filenames += ['fstcompile', 'fstarcsort'] - -linux_libraries = ['libfst.so.13', 'libfstfar.so.13', - 'libfstscript.so.13', 'libfstfarscript.so.13', - 'libkaldi-hmm.so', 'libkaldi-util.so', - 'libkaldi-base.so', 'libkaldi-tree.so', - 'libkaldi-feat.so', 'libkaldi-transform.so', 'libkaldi-lm.so', - 'libkaldi-gmm.so', 'libkaldi-lat.so', 'libkaldi-decoder.so', - 'libkaldi-fstext.so', 'libkaldi-ivector.so'] + included_filenames += ['fstcompile', 'fstarcsort', 'fstconvert'] + +linux_libraries = [#'libfst.so.13', 'libfstfar.so.13', + #'libfstscript.so.13', 'libfstfarscript.so.13', + #'libkaldi-hmm.so', 'libkaldi-util.so', + #'libkaldi-base.so', 'libkaldi-tree.so', + #'libkaldi-feat.so', 'libkaldi-transform.so', 'libkaldi-lm.so', + #'libkaldi-gmm.so', 'libkaldi-lat.so', 'libkaldi-decoder.so', + #'libkaldi-fstext.so', 'libkaldi-ivector.so' +] included_libraries = {'linux': linux_libraries, - 'win32': [#'openfst64.dll', - 'libgcc_s_seh-1.dll', 'libgfortran-3.dll', - 'libquadmath-0.dll', 'libopenblas.dll'], - 'darwin': ['libfst.13.dylib', 'libfstfarscript.13.dylib', 'libfstscript.13.dylib', - 'libfstfar.13.dylib', 'libfstngram.13.dylib', + 'win32': [ # 'openfst64.dll', + 'libgcc_s_seh-1.dll', 'libgfortran-3.dll', + 'libquadmath-0.dll', 'libopenblas.dll'], + 'darwin': [#'libfst.13.dylib', 'libfstfarscript.13.dylib', 'libfstscript.13.dylib', + #'libfstfar.13.dylib', 'libfstngram.13.dylib', 'libkaldi-hmm.dylib', 'libkaldi-util.dylib', 'libkaldi-thread.dylib', 'libkaldi-base.dylib', 'libkaldi-tree.dylib', 'libkaldi-matrix.dylib', 'libkaldi-feat.dylib', 'libkaldi-transform.dylib', 'libkaldi-lm.dylib', @@ -87,9 +95,12 @@ def collect_kaldi_binaries(directory): if value == exe_ext: if key not in included_filenames: continue - shutil.copy(os.path.join(root, name), bin_out) - st = os.stat(bin_out) - os.chmod(bin_out, st.st_mode | stat.S_IEXEC) + try: + 
shutil.copy(os.path.join(root, name), bin_out) + st = os.stat(bin_out) + os.chmod(bin_out, st.st_mode | stat.S_IEXEC) + except OSError: + pass elif name in included_libraries[sys.platform]: shutil.copy(os.path.join(root, name), bin_out) else: diff --git a/montreal_forced_aligner/trainers/base.py b/montreal_forced_aligner/trainers/base.py index 52d2f29a..887d1379 100644 --- a/montreal_forced_aligner/trainers/base.py +++ b/montreal_forced_aligner/trainers/base.py @@ -1,14 +1,16 @@ import os import re +import time from tqdm import tqdm import subprocess import shutil from .. import __version__ -from ..exceptions import TrainerError -from ..helper import thirdparty_binary, make_path_safe +from ..exceptions import TrainerError, KaldiProcessingError +from ..helper import thirdparty_binary, make_path_safe, log_kaldi_errors -from ..multiprocessing import (align, acc_stats, convert_ali_to_textgrids, compute_alignment_improvement) +from ..multiprocessing import (align, acc_stats, convert_ali_to_textgrids, + compute_alignment_improvement, compile_train_graphs) from ..models import AcousticModel from ..features.config import FeatureConfig @@ -44,6 +46,7 @@ class BaseTrainer(object): """ def __init__(self, default_feature_config): + self.logger = None self.transition_scale = 1.0 self.acoustic_scale = 0.1 self.self_loop_scale = 0.1 @@ -107,6 +110,11 @@ def final_gaussian_iteration(self): def gaussian_increment(self): return int((self.max_gaussians - self.initial_gaussians) / self.final_gaussian_iteration) + @property + def align_options(self): + return {'beam': self.beam, 'retry_beam': self.retry_beam, 'transition_scale': self.transition_scale, + 'acoustic_scale':self.acoustic_scale, 'self_loop_scale': self.self_loop_scale} + def update(self, data): for k, v in data.items(): if k == 'use_mp': @@ -119,10 +127,19 @@ def update(self, data): setattr(self, k, v) self.compute_calculated_properties() - def _setup_for_init(self, identifier, temporary_directory, corpus, dictionary): - 
print('Initializing training for {}...'.format(identifier)) + def _setup_for_init(self, identifier, temporary_directory, corpus, dictionary, logger=None): + begin = time.time() self.temp_directory = temporary_directory self.identifier = identifier + dirty_path = os.path.join(self.train_directory, 'dirty') + done_path = os.path.join(self.align_directory, 'done') + if os.path.exists(dirty_path): # if there was an error, let's redo from scratch + shutil.rmtree(self.train_directory) + if os.path.exists(done_path): + return + if self.logger is None and logger is not None: + self.logger = logger + self.logger.info('Initializing training for {}...'.format(identifier)) self.corpus = corpus self.dictionary = dictionary os.makedirs(self.train_directory, exist_ok=True) @@ -130,11 +147,19 @@ def _setup_for_init(self, identifier, temporary_directory, corpus, dictionary): os.makedirs(self.log_directory, exist_ok=True) os.makedirs(self.align_log_directory, exist_ok=True) if self.subset is not None and self.subset > corpus.num_utterances: - print('Warning: Subset specified is larger than the dataset, using full corpus for this training block.') - self.data_directory = corpus.split_directory() - self.feature_config.generate_features(self.corpus) - if self.subset is not None: - self.data_directory = corpus.subset_directory(self.subset, self.feature_config) + self.logger.warning('Subset specified is larger than the dataset, ' + 'using full corpus for this training block.') + + try: + self.data_directory = corpus.split_directory() + self.feature_config.generate_features(self.corpus, logger=self.logger) + if self.subset is not None: + self.data_directory = corpus.subset_directory(self.subset, self.feature_config) + except Exception as e: + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs, self.logger) + raise + self.logger.debug('Setup for initialization took {} seconds'.format(time.time() - begin)) def init_training(self, identifier, temporary_directory, corpus, 
dictionary, previous_trainer): raise NotImplementedError @@ -186,82 +211,128 @@ def get_unaligned_utterances(self): return error_files def align(self, subset, call_back=None): - align('final', self.train_directory, self.data_directory, - self.dictionary.optional_silence_csl, - self.corpus.num_jobs, self, self.align_directory) - - log_dir = os.path.join(self.align_directory, 'log') - os.makedirs(log_dir, exist_ok=True) - - shutil.copy(os.path.join(self.train_directory, 'tree'), self.align_directory) - shutil.copyfile(os.path.join(self.train_directory, 'final.mdl'), - os.path.join(self.align_directory, 'final.mdl')) - - shutil.copyfile(os.path.join(self.train_directory, 'final.occs'), - os.path.join(self.align_directory, 'final.occs')) - self.save(os.path.join(self.align_directory, 'acoustic_model.zip')) - self.export_textgrids() + dirty_path = os.path.join(self.align_directory, 'dirty') + if os.path.exists(dirty_path): # if there was an error, let's redo from scratch + shutil.rmtree(self.align_directory) + done_path = os.path.join(self.align_directory, 'done') + if not os.path.exists(done_path): + message = 'Generating alignments using {} models'.format(self.identifier) + if subset: + message += ' using {} utterances...'.format(subset) + else: + message += ' for the whole corpus...' 
+ self.logger.info(message) + begin = time.time() + self.logger.debug('Using {} as the feature name'.format(self.feature_file_base_name)) + if subset is None: + align_data_directory = self.corpus.split_directory() + else: + align_data_directory = self.corpus.subset_directory(subset, self.feature_config) + try: + log_dir = os.path.join(self.align_directory, 'log') + os.makedirs(log_dir, exist_ok=True) + shutil.copy(os.path.join(self.train_directory, 'tree'), self.align_directory) + shutil.copyfile(os.path.join(self.train_directory, 'final.mdl'), + os.path.join(self.align_directory, 'final.mdl')) + + if os.path.exists(os.path.join(self.train_directory, 'lda.mat')): + shutil.copyfile(os.path.join(self.train_directory, 'lda.mat'), + os.path.join(self.align_directory, 'lda.mat')) + shutil.copyfile(os.path.join(self.train_directory, 'final.occs'), + os.path.join(self.align_directory, 'final.occs')) + compile_train_graphs(self.align_directory, self.dictionary.output_directory, + align_data_directory, self.corpus.num_jobs, self) + align('final', self.align_directory, align_data_directory, + self.dictionary.optional_silence_csl, + self.corpus.num_jobs, self, self.align_directory) + self.save(os.path.join(self.align_directory, 'acoustic_model.zip')) + except Exception as e: + with open(dirty_path, 'w'): + pass + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs, self.logger) + raise + with open(done_path, 'w'): + pass + self.logger.debug('Alignment took {} seconds'.format(time.time() - begin)) + else: + self.logger.info('Alignments using {} models already done'.format(self.identifier)) + if self.debug: + self.export_textgrids() def train(self, call_back=None): - final_mdl_path = os.path.join(self.train_directory, 'final.mdl') - if os.path.exists(final_mdl_path): - print('{} training already done, skipping.'.format(self.identifier)) + done_path = os.path.join(self.train_directory, 'done') + dirty_path = os.path.join(self.train_directory, 'dirty') + if 
os.path.exists(done_path): + self.logger.info('{} training already done, skipping initialization.'.format(self.identifier)) return + begin = time.time() + final_mdl_path = os.path.join(self.train_directory, 'final.mdl') num_gauss = self.initial_gaussians if call_back == print: iters = tqdm(range(1, self.num_iterations)) else: iters = range(1, self.num_iterations) - for i in iters: - model_path = os.path.join(self.train_directory, '{}.mdl'.format(i)) - occs_path = os.path.join(self.train_directory, '{}.occs'.format(i + 1)) - next_model_path = os.path.join(self.train_directory, '{}.mdl'.format(i + 1)) - if os.path.exists(next_model_path): - continue - if i in self.realignment_iterations: - align(i, self.train_directory, self.data_directory, - self.dictionary.optional_silence_csl, - self.corpus.num_jobs, self) - if self.debug: - compute_alignment_improvement(i, self, self.train_directory, self.corpus.num_jobs) - - acc_stats(i, self.train_directory, self.data_directory, self.corpus.num_jobs, self) - log_path = os.path.join(self.log_directory, 'update.{}.log'.format(i)) - with open(log_path, 'w') as logf: - acc_files = [os.path.join(self.train_directory, '{}.{}.acc'.format(i, x)) - for x in range(self.corpus.num_jobs)] - est_proc = subprocess.Popen([thirdparty_binary('gmm-est'), - '--write-occs=' + occs_path, - '--mix-up=' + str(num_gauss), '--power=' + str(self.power), - model_path, - "{} - {}|".format(thirdparty_binary('gmm-sum-accs'), - ' '.join(map(make_path_safe, acc_files))), - next_model_path], - stderr=logf) - est_proc.communicate() - if not self.debug: - for f in acc_files: - os.remove(f) - if not os.path.exists(next_model_path): - raise(Exception('There was an error training in iteration {}, please check the logs.'.format(i))) - self.parse_log_directory(self.log_directory, i, self.corpus.num_jobs, call_back) - if i < self.final_gaussian_iteration: - num_gauss += self.gaussian_increment - shutil.copy(os.path.join(self.train_directory, 
'{}.mdl'.format(self.num_iterations)), - final_mdl_path) - shutil.copy(os.path.join(self.train_directory, '{}.occs'.format(self.num_iterations)), - os.path.join(self.train_directory, 'final.occs')) - if not self.debug: - for i in range(1, self.num_iterations): + try: + for i in iters: model_path = os.path.join(self.train_directory, '{}.mdl'.format(i)) - try: - os.remove(model_path) - except FileNotFoundError: - pass - try: - os.remove(os.path.join(self.train_directory, '{}.occs'.format(i))) - except FileNotFoundError: - pass + occs_path = os.path.join(self.train_directory, '{}.occs'.format(i + 1)) + next_model_path = os.path.join(self.train_directory, '{}.mdl'.format(i + 1)) + if os.path.exists(next_model_path): + continue + if i in self.realignment_iterations: + align(i, self.train_directory, self.data_directory, + self.dictionary.optional_silence_csl, + self.corpus.num_jobs, self) + if self.debug: + compute_alignment_improvement(i, self, self.train_directory, self.corpus.num_jobs) + acc_stats(i, self.train_directory, self.data_directory, self.corpus.num_jobs, self) + log_path = os.path.join(self.log_directory, 'update.{}.log'.format(i)) + with open(log_path, 'w') as logf: + acc_files = [os.path.join(self.train_directory, '{}.{}.acc'.format(i, x)) + for x in range(self.corpus.num_jobs)] + est_proc = subprocess.Popen([thirdparty_binary('gmm-est'), + '--write-occs=' + occs_path, + '--mix-up=' + str(num_gauss), '--power=' + str(self.power), + model_path, + "{} - {}|".format(thirdparty_binary('gmm-sum-accs'), + ' '.join(map(make_path_safe, acc_files))), + next_model_path], + stderr=logf) + est_proc.communicate() + if not self.debug: + for f in acc_files: + os.remove(f) + if not os.path.exists(next_model_path): + raise(Exception('There was an error training in iteration {}, please check the logs.'.format(i))) + self.parse_log_directory(self.log_directory, i, self.corpus.num_jobs, call_back) + if i < self.final_gaussian_iteration: + num_gauss += self.gaussian_increment 
+ shutil.copy(os.path.join(self.train_directory, '{}.mdl'.format(self.num_iterations)), + final_mdl_path) + shutil.copy(os.path.join(self.train_directory, '{}.occs'.format(self.num_iterations)), + os.path.join(self.train_directory, 'final.occs')) + if not self.debug: + for i in range(1, self.num_iterations): + model_path = os.path.join(self.train_directory, '{}.mdl'.format(i)) + try: + os.remove(model_path) + except FileNotFoundError: + pass + try: + os.remove(os.path.join(self.train_directory, '{}.occs'.format(i))) + except FileNotFoundError: + pass + except Exception as e: + with open(dirty_path, 'w'): + pass + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs, self.logger) + raise + with open(done_path, 'w'): + pass + self.logger.info('Training complete!') + self.logger.debug('Training took {} seconds'.format(time.time() - begin)) @property def meta(self): @@ -276,9 +347,15 @@ def export_textgrids(self): """ Export a TextGrid file for every sound file in the dataset """ - - convert_ali_to_textgrids(self, os.path.join(self.align_directory, 'textgrids'), self.align_directory, + begin = time.time() + try: + convert_ali_to_textgrids(self, os.path.join(self.align_directory, 'textgrids'), self.align_directory, self.dictionary, self.corpus, self.corpus.num_jobs, self) + except Exception as e: + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs, self.logger) + raise + self.logger.debug('Exporting textgrids took {} seconds'.format(time.time() - begin)) def save(self, path): """ diff --git a/montreal_forced_aligner/trainers/ivector_extractor.py b/montreal_forced_aligner/trainers/ivector_extractor.py index bc15047d..e9f55a16 100644 --- a/montreal_forced_aligner/trainers/ivector_extractor.py +++ b/montreal_forced_aligner/trainers/ivector_extractor.py @@ -2,11 +2,14 @@ from tqdm import tqdm import subprocess import shutil +import time from .base import BaseTrainer -from ..helper import thirdparty_binary, make_path_safe +from 
..helper import thirdparty_binary, make_path_safe, log_kaldi_errors, parse_logs, load_scp +from ..exceptions import KaldiProcessingError -from ..multiprocessing import (gmm_gselect, acc_global_stats, gauss_to_post, acc_ivector_stats, extract_ivectors) +from ..multiprocessing.ivector import (gmm_gselect, acc_global_stats, gauss_to_post, + acc_ivector_stats, extract_ivectors) from ..models import IvectorExtractor @@ -40,163 +43,248 @@ def __init__(self, default_feature_config): self.ubm_num_iterations = 4 self.ubm_num_gselect = 30 - self.ubm_num_frames = 400000 + self.ubm_num_frames = 500000 self.ubm_num_gaussians = 256 - self.ubm_num_iterations_init = 20 self.ubm_initial_gaussian_proportion = 0.5 - self.ubm_cleanup = True self.ubm_min_gaussian_weight = 0.0001 self.ubm_remove_low_count_gaussians = True - self.ubm_num_threads = 32 - self.ivector_dimension = 100 - self.ivector_period = 10 + self.ivector_dimension = 128 self.num_iterations = 10 - self.num_gselect = 5 - self.posterior_scale = 0.1 + self.num_gselect = 20 + self.posterior_scale = 1.0 self.silence_weight = 0.0 - self.splice_left_context = 3 - self.splice_right_context = 3 self.min_post = 0.025 self.gaussian_min_count = 100 - self.subsample = 2 + self.subsample = 5 self.max_count = 100 + self.apply_cmn = True + self.previous_align_directory = None @property def meta(self): - return {'ivector_period': self.ivector_period, - 'splice_left_context': self.splice_left_context, - 'splice_right_context': self.splice_right_context, - 'num_gselect': self.num_gselect, - 'min_post': self.min_post, - 'posterior_scale': self.posterior_scale, - } + from .. 
import __version__ + return { + 'version': __version__, + 'ivector_dimension': self.ivector_dimension, + 'apply_cmn': self.apply_cmn, + 'num_gselect': self.num_gselect, + 'min_post': self.min_post, + 'posterior_scale': self.posterior_scale, + 'features': self.feature_config.params(), + } @property def train_type(self): return 'ivector' - def init_training(self, identifier, temporary_directory, corpus, dictionary, previous_trainer): - self._setup_for_init(identifier, temporary_directory, corpus, dictionary) - for f in os.listdir(previous_trainer.align_directory): - if os.path.isdir(os.path.join(previous_trainer.align_directory, f)): - continue - shutil.copy(os.path.join(previous_trainer.align_directory, f), os.path.join(self.align_directory, f)) - corpus_directory = self.corpus.output_directory - lda_mat_path = os.path.join(corpus_directory, 'lda.mat') - if os.path.exists(lda_mat_path): - shutil.copy(lda_mat_path, os.path.join(self.train_directory, 'lda.mat')) + @property + def ivector_options(self): + return {'subsample': self.subsample, 'num_gselect': self.num_gselect, 'posterior_scale': self.posterior_scale, + 'min_post': self.min_post, 'silence_weight': self.silence_weight, 'max_count': self.max_count, + 'ivector_dimension': self.ivector_dimension + } + def train_ubm(self, call_back=None): + if call_back is None: + call_back = print + # train diag ubm + final_ubm_path = os.path.join(self.train_directory, 'final.ubm') + if os.path.exists(final_ubm_path): + return + begin = time.time() + self.logger.info('Initializing diagonal UBM...') # Initialize model from E-M in memory + log_directory = os.path.join(self.train_directory, 'log') num_gauss_init = int(self.ubm_initial_gaussian_proportion * int(self.ubm_num_gaussians)) - log_path = os.path.join(self.train_directory, 'log', 'gmm_init.log') - - all_feats_path = os.path.join(self.corpus.output_directory, self.feature_config.feature_id + '.scp') + log_path = os.path.join(log_directory, 'gmm_init.log') + feat_name = 
self.feature_file_base_name + all_feats_path = os.path.join(self.corpus.output_directory, feat_name + '.scp') + feature_string = self.feature_config.construct_feature_proc_string(self.corpus.output_directory, + self.train_directory, + job_name=None, cmvn=self.apply_cmn) with open(all_feats_path, 'w') as outf: for i in range(self.corpus.num_jobs): with open(os.path.join(self.data_directory, - self.feature_config.feature_id + '.{}.scp'.format(i))) as inf: + feat_name + '.{}.scp'.format(i))) as inf: for line in inf: outf.write(line) - with open(log_path, 'w') as logf: - + with open(log_path, 'w') as log_file: gmm_init_proc = subprocess.Popen([thirdparty_binary('gmm-global-init-from-feats'), - '--num-threads=' + str(self.ubm_num_threads), - '--num-frames=' + str(self.ubm_num_frames), - '--num_gauss=' + str(self.ubm_num_gaussians), - '--num_gauss_init=' + str(num_gauss_init), - '--num_iters=' + str(self.ubm_num_iterations_init), - 'scp:' + all_feats_path, - os.path.join(self.train_directory, '1.dubm')], - stderr=logf) + '--num-threads={}'.format(self.corpus.num_jobs), + '--num-frames={}'.format(self.ubm_num_frames), + '--num_gauss={}'.format(self.ubm_num_gaussians), + '--num_gauss_init={}'.format(num_gauss_init), + '--num_iters={}'.format(self.ubm_num_iterations_init), + feature_string, + os.path.join(self.train_directory, '0.dubm')], + stderr=log_file) gmm_init_proc.communicate() # Store Gaussian selection indices on disk - gmm_gselect(self, self.corpus.num_jobs) - - for i in range(1, self.ubm_num_iterations): - # Accumulate stats - acc_global_stats(self, self.corpus.num_jobs, i) + gmm_gselect('0', self, self.corpus.num_jobs) + final_dubm_path = os.path.join(self.train_directory, 'final.dubm') - # Don't remove low-count Gaussians till the last tier, - # or gselect info won't be valid anymore - if i < self.ubm_num_iterations - 1: - opt = '--remove-low-count-gaussians=false' + if not os.path.exists(final_dubm_path): + self.logger.info('Training diagonal UBM...') + if 
call_back == print: + iters = tqdm(range(0, self.ubm_num_iterations)) else: - opt = '--remove-low-count-gaussians=' + str(self.ubm_remove_low_count_gaussians) - - log_path = os.path.join(self.train_directory, 'log', 'update.{}.log'.format(i)) - with open(log_path, 'w') as logf: - acc_files = [os.path.join(self.train_directory, '{}.{}.acc'.format(i, x)) - for x in range(self.corpus.num_jobs)] - gmm_global_est_proc = subprocess.Popen([thirdparty_binary('gmm-global-est'), - opt, - '--min-gaussian-weight=' + str(self.ubm_min_gaussian_weight), - os.path.join(self.train_directory, '{}.dubm'.format(i)), - "{} - {}|".format(thirdparty_binary('gmm-global-sum-accs'), - ' '.join(map(make_path_safe, acc_files))), - os.path.join(self.train_directory, '{}.dubm'.format(i + 1))], - stderr=logf) - gmm_global_est_proc.communicate() - - # Move files - shutil.copy(os.path.join(self.train_directory, '{}.dubm'.format(self.ubm_num_iterations)), - os.path.join(self.train_directory, 'final.dubm')) - - # Convert final.ubm to fgmm - log_path = os.path.join(self.train_directory, 'log', 'global_to_fgmm.log') - with open(log_path, 'w') as logf: - subprocess.call([thirdparty_binary('gmm-global-to-fgmm'), - os.path.join(self.train_directory, 'final.dubm'), - os.path.join(self.train_directory, '0.fgmm')], - stdout=subprocess.PIPE, - stderr=logf) + iters = range(0, self.ubm_num_iterations) + for i in iters: + # Accumulate stats + acc_global_stats(self, self.corpus.num_jobs, i) + + # Don't remove low-count Gaussians till the last tier, + # or gselect info won't be valid anymore + if i < self.ubm_num_iterations - 1: + opt = '--remove-low-count-gaussians=false' + else: + opt = '--remove-low-count-gaussians={}'.format(self.ubm_remove_low_count_gaussians) + + log_path = os.path.join(self.train_directory, 'log', 'update.{}.log'.format(i)) + with open(log_path, 'w') as log_file: + acc_files = [os.path.join(self.train_directory, '{}.{}.acc'.format(i, x)) + for x in range(self.corpus.num_jobs)] + 
gmm_global_est_proc = subprocess.Popen([thirdparty_binary('gmm-global-est'), + opt, + '--min-gaussian-weight=' + str(self.ubm_min_gaussian_weight), + os.path.join(self.train_directory, '{}.dubm'.format(i)), + "{} - {}|".format(thirdparty_binary('gmm-global-sum-accs'), + ' '.join(map(make_path_safe, acc_files))), + os.path.join(self.train_directory, '{}.dubm'.format(i + 1))], + stderr=log_file) + gmm_global_est_proc.communicate() + # Clean up + for p in acc_files: + os.remove(p) + # Move files + shutil.copy(os.path.join(self.train_directory, '{}.dubm'.format(self.ubm_num_iterations)), + final_dubm_path) + + parse_logs(log_directory) + self.logger.info('Finished training UBM!') + self.logger.debug('UBM training took {} seconds'.format(time.time() - begin)) + + def init_training(self, identifier, temporary_directory, corpus, dictionary, previous_trainer=None): + self._setup_for_init(identifier, temporary_directory, corpus, dictionary) + done_path = os.path.join(self.train_directory, 'done') + dirty_path = os.path.join(self.train_directory, 'dirty') + if os.path.exists(done_path): + self.logger.info('{} training already done, skipping initialization.'.format(self.identifier)) + return + begin = time.time() + self.previous_align_directory = previous_trainer.align_directory + + self.train_ubm() + self.init_ivector_train() + self.logger.info('Initialization complete!') + self.logger.debug('Initialization took {} seconds'.format(time.time() - begin)) + + def init_ivector_train(self): + init_ie_path = os.path.join(self.train_directory, '0.ie') + if os.path.exists(init_ie_path): + return + begin = time.time() # Initialize i-vector extractor - log_path = os.path.join(self.train_directory, 'log', 'init.log') - with open(log_path, 'w') as logf: + log_directory = os.path.join(self.train_directory, 'log') + log_path = os.path.join(log_directory, 'init.log') + diag_ubm_path = os.path.join(self.train_directory, 'final.dubm') + full_ubm_path = os.path.join(self.train_directory, 
'final.ubm') + with open(log_path, 'w') as log_file: + subprocess.call([thirdparty_binary('gmm-global-to-fgmm'), + diag_ubm_path, + full_ubm_path], + stderr=log_file) subprocess.call([thirdparty_binary('ivector-extractor-init'), '--ivector-dim=' + str(self.ivector_dimension), '--use-weights=false', - os.path.join(self.train_directory, '0.fgmm'), - os.path.join(self.train_directory, '1.ie')], - stderr=logf) + full_ubm_path, + init_ie_path], + stderr=log_file) + # Do Gaussian selection and posterior extraction gauss_to_post(self, self.corpus.num_jobs) - print('Initialization complete!') + parse_logs(log_directory) + self.logger.debug('Initialization ivectors took {} seconds'.format(time.time() - begin)) def align(self, subset, call_back=None): self.save(os.path.join(self.align_directory, 'ivector_extractor.zip')) - extract_ivectors(self, self.corpus.num_jobs) + # extract_ivectors(self, self.corpus.num_jobs) def train(self, call_back=None): + from sklearn.naive_bayes import GaussianNB + from joblib import dump, load + import numpy as np + done_path = os.path.join(self.train_directory, 'done') + dirty_path = os.path.join(self.train_directory, 'dirty') + if os.path.exists(done_path): + self.logger.info('{} training already done, skipping training.'.format(self.identifier)) + return + begin = time.time() if call_back == print: - iters = tqdm(range(1, self.num_iterations)) + iters = tqdm(range(0, self.num_iterations)) else: - iters = range(1, self.num_iterations) - for i in iters: - # Accumulate stats and sum - acc_ivector_stats(self, self.corpus.num_jobs, i) - - # Est extractor - log_path = os.path.join(self.train_directory, 'log', 'update.{}.log'.format(i)) - with open(log_path, 'w') as logf: - extractor_est_proc = subprocess.Popen([thirdparty_binary('ivector-extractor-est'), - '--gaussian-min-count={}'.format(self.gaussian_min_count), - os.path.join(self.train_directory, '{}.ie'.format(i)), - os.path.join(self.train_directory, 'acc.{}'.format(i)), - 
os.path.join(self.train_directory, '{}.ie'.format(i + 1))], - stderr=logf) - extractor_est_proc.communicate() - # Rename to final - shutil.copy(os.path.join(self.train_directory, '{}.ie'.format(self.num_iterations)), - os.path.join(self.train_directory, 'final.ie')) - os.makedirs(self.corpus.ivector_directory, exist_ok=True) - shutil.copy(os.path.join(self.train_directory, 'final.ie'), os.path.join(self.corpus.ivector_directory, 'final.ie')) - shutil.copy(os.path.join(self.train_directory, 'final.dubm'), os.path.join(self.corpus.ivector_directory, 'final.dubm')) + iters = range(0, self.num_iterations) + try: + log_dir = os.path.join(self.train_directory, 'log') + if not os.path.exists(os.path.join(self.train_directory, 'final.ie')): + for i in iters: + # Accumulate stats and sum + acc_ivector_stats(self, self.corpus.num_jobs, i) + + # Est extractor + log_path = os.path.join(log_dir, 'update.{}.log'.format(i)) + with open(log_path, 'w') as log_file: + extractor_est_proc = subprocess.Popen([thirdparty_binary('ivector-extractor-est'), + '--num-threads={}'.format(self.corpus.num_jobs), + '--gaussian-min-count={}'.format(self.gaussian_min_count), + os.path.join(self.train_directory, '{}.ie'.format(i)), + os.path.join(self.train_directory, 'acc.{}'.format(i)), + os.path.join(self.train_directory, '{}.ie'.format(i + 1))], + stderr=log_file) + extractor_est_proc.communicate() + + # Rename to final + shutil.copy(os.path.join(self.train_directory, '{}.ie'.format(self.num_iterations)), + os.path.join(self.train_directory, 'final.ie')) + extract_ivectors(self.train_directory, self.corpus.split_directory(), self, self.corpus.num_jobs) + x = [] + y = [] + speakers = sorted(self.corpus.speak_utt_mapping.keys()) + for i in range(self.corpus.num_jobs): + ivec = load_scp(os.path.join(self.train_directory, 'ivectors.{}'.format(i))) + for utt, ivector in ivec.items(): + ivector = [float(x) for x in ivector] + s = self.corpus.utt_speak_mapping[utt] + s_ind = speakers.index(s) + 
y.append(s_ind) + x.append(ivector) + x = np.array(x) + y = np.array(y) + clf = GaussianNB() + clf.fit(x, y) + clf_param_path = os.path.join(self.train_directory, 'speaker_classifier.mdl') + dump(clf, clf_param_path) + classes_path = os.path.join(self.train_directory, 'speaker_labels.txt') + with open(classes_path, 'w', encoding='utf8') as f: + for i, s in enumerate(speakers): + f.write('{} {}\n'.format(s, i)) + + except Exception as e: + with open(dirty_path, 'w'): + pass + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs, self.logger) + raise + with open(done_path, 'w'): + pass + self.logger.info('Training complete!') + self.logger.debug('Training took {} seconds'.format(time.time() - begin)) def save(self, path): """ diff --git a/montreal_forced_aligner/trainers/lda.py b/montreal_forced_aligner/trainers/lda.py index 941bb545..0ab0bb12 100644 --- a/montreal_forced_aligner/trainers/lda.py +++ b/montreal_forced_aligner/trainers/lda.py @@ -2,9 +2,11 @@ from tqdm import tqdm import subprocess import shutil +import time from ..multiprocessing import (align, acc_stats, calc_lda_mllt, lda_acc_stats, compute_alignment_improvement) -from ..helper import thirdparty_binary, make_path_safe, filter_scp +from ..helper import thirdparty_binary, make_path_safe, filter_scp, log_kaldi_errors, parse_logs +from ..exceptions import KaldiProcessingError from .triphone import TriphoneTrainer @@ -51,100 +53,124 @@ def compute_calculated_properties(self): def train_type(self): return 'lda' + @property + def lda_options(self): + return {'lda_dimension': self.lda_dimension, 'boost_silence': self.boost_silence, + 'random_prune': self.random_prune} + def init_training(self, identifier, temporary_directory, corpus, dictionary, previous_trainer): self._setup_for_init(identifier, temporary_directory, corpus, dictionary) - - self.feature_config.lda = True - self.feature_config.deltas = False - self.feature_config.directory = None - 
self.feature_config.generate_features(self.corpus, overwrite=True) - lda_acc_stats(self.train_directory, self.data_directory, previous_trainer.align_directory, self, - self.dictionary.silence_csl, self.corpus.num_jobs) - self.feature_config.directory = self.train_directory - self.feature_config.generate_features(self.corpus, overwrite=True) - if self.data_directory != self.corpus.split_directory(): - utt_list = [] - subset_utt_path = os.path.join(self.data_directory, 'included_utts.txt') - with open(subset_utt_path, 'r') as f: - for line in f: - utt_list.append(line.strip()) - for j in range(self.corpus.num_jobs): - base_path = os.path.join(corpus.split_directory(), self.feature_config.feature_id + '.{}.scp'.format(j)) - subset_scp = os.path.join(self.data_directory, self.feature_config.feature_id + '.{}.scp'.format(j)) - filtered = filter_scp(utt_list, base_path) - with open(subset_scp, 'w') as f: - for line in filtered: - f.write(line.strip() + '\n') - super(LdaTrainer, self).init_training(identifier, temporary_directory, corpus, dictionary, previous_trainer) - print('Initialization complete!') + done_path = os.path.join(self.train_directory, 'done') + dirty_path = os.path.join(self.train_directory, 'dirty') + if os.path.exists(done_path): + self.logger.info('{} training already done, skipping initialization.'.format(self.identifier)) + return + begin = time.time() + try: + self.feature_config.directory = None + lda_acc_stats(self.train_directory, self.data_directory, previous_trainer.align_directory, self, + self.dictionary.silence_csl, self.corpus.num_jobs) + self.feature_config.directory = self.train_directory + if self.data_directory != self.corpus.split_directory(): + utt_list = [] + subset_utt_path = os.path.join(self.data_directory, 'included_utts.txt') + with open(subset_utt_path, 'r') as f: + for line in f: + utt_list.append(line.strip()) + #for j in range(self.corpus.num_jobs): + # base_path = os.path.join(corpus.split_directory(), 
self.feature_config.feature_id + '.{}.scp'.format(j)) + # subset_scp = os.path.join(self.data_directory, self.feature_config.feature_id + '.{}.scp'.format(j)) + # filtered = filter_scp(utt_list, base_path) + # with open(subset_scp, 'w') as f: + # for line in filtered: + # f.write(line.strip() + '\n') + except Exception as e: + with open(dirty_path, 'w') as _: + pass + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs, self.logger) + raise + self._setup_tree(previous_trainer.align_directory) + self.logger.info('Initialization complete!') + self.logger.debug('Initialization took {} seconds'.format(time.time() - begin)) def train(self, call_back=None): - final_mdl_path = os.path.join(self.train_directory, 'final.mdl') - if os.path.exists(final_mdl_path): - print('{} training already done, skipping.'.format(self.identifier)) + done_path = os.path.join(self.train_directory, 'done') + dirty_path = os.path.join(self.train_directory, 'dirty') + if os.path.exists(done_path): + self.logger.info('{} training already done, skipping.'.format(self.identifier)) return - num_gauss = self.initial_gaussians - if call_back == print: - iters = tqdm(range(1, self.num_iterations)) - else: - iters = range(1, self.num_iterations) - sil_phones = self.dictionary.silence_csl - for i in iters: - model_path = os.path.join(self.train_directory, '{}.mdl'.format(i)) - occs_path = os.path.join(self.train_directory, '{}.occs'.format(i + 1)) - next_model_path = os.path.join(self.train_directory, '{}.mdl'.format(i + 1)) - if os.path.exists(next_model_path): - continue - if i in self.realignment_iterations: - align(i, self.train_directory, self.data_directory, - self.dictionary.optional_silence_csl, - self.corpus.num_jobs, self) - if self.debug: - compute_alignment_improvement(i, self, self.train_directory, self.corpus.num_jobs) - if i in self.mllt_iterations: - calc_lda_mllt(self.train_directory, self.data_directory, sil_phones, - self.corpus.num_jobs, self, - initial=False, 
iteration=i) + begin = time.time() + try: + num_gauss = self.initial_gaussians + if call_back == print: + iters = tqdm(range(0, self.num_iterations)) + else: + iters = range(0, self.num_iterations) + sil_phones = self.dictionary.silence_csl + for i in iters: + model_path = os.path.join(self.train_directory, '{}.mdl'.format(i)) + occs_path = os.path.join(self.train_directory, '{}.occs'.format(i + 1)) + next_model_path = os.path.join(self.train_directory, '{}.mdl'.format(i + 1)) + if os.path.exists(next_model_path): + continue + if i in self.realignment_iterations: + align(i, self.train_directory, self.data_directory, + self.dictionary.optional_silence_csl, + self.corpus.num_jobs, self) + if self.debug: + compute_alignment_improvement(i, self, self.train_directory, self.corpus.num_jobs) + if i in self.mllt_iterations: + calc_lda_mllt(self.train_directory, self.data_directory, sil_phones, + self.corpus.num_jobs, self, + initial=False, iteration=i) - acc_stats(i, self.train_directory, self.data_directory, self.corpus.num_jobs, - self) - log_path = os.path.join(self.log_directory, 'update.{}.log'.format(i)) - with open(log_path, 'w') as logf: - acc_files = [os.path.join(self.train_directory, '{}.{}.acc'.format(i, x)) - for x in range(self.corpus.num_jobs)] - est_proc = subprocess.Popen([thirdparty_binary('gmm-est'), - '--write-occs=' + occs_path, - '--mix-up=' + str(num_gauss), '--power=' + str(self.power), - model_path, - "{} - {}|".format(thirdparty_binary('gmm-sum-accs'), - ' '.join(map(make_path_safe, acc_files))), - next_model_path], - stderr=logf) - est_proc.communicate() + acc_stats(i, self.train_directory, self.data_directory, self.corpus.num_jobs, + self) + log_path = os.path.join(self.log_directory, 'update.{}.log'.format(i)) + with open(log_path, 'w') as log_file: + acc_files = [os.path.join(self.train_directory, '{}.{}.acc'.format(i, x)) + for x in range(self.corpus.num_jobs)] + est_proc = subprocess.Popen([thirdparty_binary('gmm-est'), + '--write-occs=' + 
occs_path, + '--mix-up=' + str(num_gauss), '--power=' + str(self.power), + model_path, + "{} - {}|".format(thirdparty_binary('gmm-sum-accs'), + ' '.join(map(make_path_safe, acc_files))), + next_model_path], + stderr=log_file) + est_proc.communicate() + if not self.debug: + for f in acc_files: + os.remove(f) + self.parse_log_directory(self.log_directory, i, self.corpus.num_jobs, call_back) + if i < self.final_gaussian_iteration: + num_gauss += self.gaussian_increment + shutil.copy(os.path.join(self.train_directory, '{}.mdl'.format(self.num_iterations)), + os.path.join(self.train_directory, 'final.mdl')) + shutil.copy(os.path.join(self.train_directory, '{}.occs'.format(self.num_iterations)), + os.path.join(self.train_directory, 'final.occs')) + shutil.copy(os.path.join(self.train_directory, 'lda.mat'), + os.path.join(self.align_directory, 'lda.mat')) if not self.debug: - for f in acc_files: - os.remove(f) - self.parse_log_directory(self.log_directory, i, self.corpus.num_jobs, call_back) - compute_alignment_improvement(i, self, self.train_directory, self.corpus.num_jobs) - if i < self.final_gaussian_iteration: - num_gauss += self.gaussian_increment - shutil.copy(os.path.join(self.train_directory, '{}.mdl'.format(self.num_iterations)), - os.path.join(self.train_directory, 'final.mdl')) - shutil.copy(os.path.join(self.train_directory, '{}.occs'.format(self.num_iterations)), - os.path.join(self.train_directory, 'final.occs')) - shutil.copy(os.path.join(self.train_directory, 'lda.mat'), - os.path.join(self.corpus.output_directory, 'lda.mat')) - shutil.copy(os.path.join(self.train_directory, 'lda.mat'), - os.path.join(self.corpus.split_directory(), 'lda.mat')) - self.feature_config.generate_features(self.corpus, overwrite=True) - if not self.debug: - for i in range(1, self.num_iterations): - model_path = os.path.join(self.train_directory, '{}.mdl'.format(i)) - try: - os.remove(model_path) - except FileNotFoundError: - pass - try: - 
os.remove(os.path.join(self.train_directory, '{}.occs'.format(i))) - except FileNotFoundError: - pass + for i in range(1, self.num_iterations): + model_path = os.path.join(self.train_directory, '{}.mdl'.format(i)) + try: + os.remove(model_path) + except FileNotFoundError: + pass + try: + os.remove(os.path.join(self.train_directory, '{}.occs'.format(i))) + except FileNotFoundError: + pass + except Exception as e: + with open(dirty_path, 'w'): + pass + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs, self.logger) + raise + with open(done_path, 'w'): + pass + + self.logger.info('Training complete!') + self.logger.debug('Training took {} seconds'.format(time.time() - begin)) diff --git a/montreal_forced_aligner/trainers/monophone.py b/montreal_forced_aligner/trainers/monophone.py index f6e9c990..67998ff4 100644 --- a/montreal_forced_aligner/trainers/monophone.py +++ b/montreal_forced_aligner/trainers/monophone.py @@ -1,9 +1,11 @@ import os import re import subprocess +import time from .base import BaseTrainer -from ..helper import thirdparty_binary, make_path_safe +from ..helper import thirdparty_binary, make_path_safe, log_kaldi_errors, parse_logs +from ..exceptions import KaldiProcessingError from ..multiprocessing import (mono_align_equal, compile_train_graphs, compute_alignment_improvement) @@ -61,42 +63,60 @@ def get_num_gauss(self): def init_training(self, identifier, temporary_directory, corpus, dictionary, previous_trainer=None): self._setup_for_init(identifier, temporary_directory, corpus, dictionary) + done_path = os.path.join(self.train_directory, 'done') + dirty_path = os.path.join(self.train_directory, 'dirty') + if os.path.exists(done_path): + self.logger.info('{} training already done, skipping initialization.'.format(self.identifier)) + return + begin = time.time() tree_path = os.path.join(self.train_directory, 'tree') mdl_path = os.path.join(self.train_directory, '0.mdl') - feat_dim = corpus.get_feat_dim(self.feature_config) - 
feat_path = os.path.join(self.data_directory, self.feature_config.feature_id + '.0.scp') - shared_phones_opt = "--shared-phones=" + os.path.join(dictionary.phones_dir, 'sets.int') - log_path = os.path.join(self.log_directory, 'init.log') - with open(log_path, 'w') as log_file: - subprocess.call([thirdparty_binary('gmm-init-mono'), shared_phones_opt, - "--train-feats=scp:"+feat_path, - os.path.join(dictionary.output_directory, 'topo'), - str(feat_dim), - mdl_path, - tree_path], - stderr=log_file) - num_gauss = self.get_num_gauss() - self.initial_gaussians = num_gauss - compile_train_graphs(self.train_directory, dictionary.output_directory, + try: + feat_dim = corpus.get_feat_dim(self.feature_config) + feature_string = self.feature_config.construct_feature_proc_string(self.data_directory, self.train_directory, 0) + #feature_string += " subset-feats --n=10 ark:- ark:-| " + shared_phones_opt = "--shared-phones=" + os.path.join(dictionary.phones_dir, 'sets.int') + log_path = os.path.join(self.log_directory, 'init.log') + with open(log_path, 'w') as log_file: + subprocess.call([thirdparty_binary('gmm-init-mono'), shared_phones_opt, + "--train-feats="+feature_string, + os.path.join(dictionary.output_directory, 'topo'), + str(feat_dim), + mdl_path, + tree_path], + stderr=log_file) + num_gauss = self.get_num_gauss() + self.initial_gaussians = num_gauss + compile_train_graphs(self.train_directory, dictionary.output_directory, + self.data_directory, corpus.num_jobs, self) + mono_align_equal(self.train_directory, self.data_directory, corpus.num_jobs, self) - mono_align_equal(self.train_directory, - self.data_directory, corpus.num_jobs, self) - log_path = os.path.join(self.train_directory, 'log', 'update.0.log') - with open(log_path, 'w') as log_file: - acc_files = [os.path.join(self.train_directory, '0.{}.acc'.format(x)) for x in range(corpus.num_jobs)] - est_proc = subprocess.Popen([thirdparty_binary('gmm-est'), - '--min-gaussian-occupancy=3', - 
'--mix-up={}'.format(num_gauss), '--power={}'.format(self.power), - mdl_path, "{} - {}|".format(thirdparty_binary('gmm-sum-accs'), - ' '.join(map(make_path_safe, acc_files))), - os.path.join(self.train_directory, '1.mdl')], - stderr=log_file) - est_proc.communicate() - if not self.debug: - for f in acc_files: - os.remove(f) - print('Initializing alignment improvement calculations') - compute_alignment_improvement(0, self, self.train_directory, self.corpus.num_jobs) - print('Initialization complete!') + log_path = os.path.join(self.train_directory, 'log', 'update.0.log') + with open(log_path, 'w') as log_file: + acc_files = [os.path.join(self.train_directory, '0.{}.acc'.format(x)) for x in range(corpus.num_jobs)] + est_proc = subprocess.Popen([thirdparty_binary('gmm-est'), + '--min-gaussian-occupancy=3', + '--mix-up={}'.format(num_gauss), '--power={}'.format(self.power), + mdl_path, "{} - {}|".format(thirdparty_binary('gmm-sum-accs'), + ' '.join(map(make_path_safe, acc_files))), + os.path.join(self.train_directory, '1.mdl')], + stderr=log_file) + est_proc.communicate() + if not self.debug: + for f in acc_files: + os.remove(f) + parse_logs(self.log_directory) + if self.debug: + self.logger.info('Initializing alignment improvement calculations') + compute_alignment_improvement(0, self, self.train_directory, self.corpus.num_jobs) + + except Exception as e: + with open(dirty_path, 'w'): + pass + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs, self.logger) + raise + self.logger.info('Initialization complete!') + self.logger.debug('Initialization took {} seconds'.format(time.time() - begin)) diff --git a/montreal_forced_aligner/trainers/sat.py b/montreal_forced_aligner/trainers/sat.py index 24ad46c4..3b298de2 100644 --- a/montreal_forced_aligner/trainers/sat.py +++ b/montreal_forced_aligner/trainers/sat.py @@ -2,11 +2,13 @@ from tqdm import tqdm import subprocess import shutil +import time from ..multiprocessing import (align, 
compile_train_graphs, acc_stats, tree_stats, convert_alignments, calc_fmllr, compute_alignment_improvement) -from ..helper import thirdparty_binary, make_path_safe +from ..helper import thirdparty_binary, make_path_safe, log_kaldi_errors, parse_logs +from ..exceptions import KaldiProcessingError from .triphone import TriphoneTrainer @@ -52,143 +54,235 @@ def train_type(self): return 'sat' def train(self, call_back=None): - final_mdl_path = os.path.join(self.train_directory, 'final.mdl') - if os.path.exists(final_mdl_path): - print('{} training already done, skipping.'.format(self.identifier)) + done_path = os.path.join(self.train_directory, 'done') + dirty_path = os.path.join(self.train_directory, 'dirty') + if os.path.exists(done_path): + self.logger.info('{} training already done, skipping initialization.'.format(self.identifier)) return + begin = time.time() num_gauss = self.initial_gaussians if call_back == print: iters = tqdm(range(1, self.num_iterations)) else: iters = range(1, self.num_iterations) sil_phones = self.dictionary.silence_csl - for i in iters: - model_path = os.path.join(self.train_directory, '{}.mdl'.format(i)) - occs_path = os.path.join(self.train_directory, '{}.occs'.format(i + 1)) - next_model_path = os.path.join(self.train_directory, '{}.mdl'.format(i + 1)) - if os.path.exists(next_model_path): - continue - if i in self.realignment_iterations: - align(i, self.train_directory, self.data_directory, - self.dictionary.optional_silence_csl, - self.corpus.num_jobs, self) - if self.debug: - compute_alignment_improvement(i, self, self.train_directory, self.corpus.num_jobs) - if i in self.fmllr_iterations: - calc_fmllr(self.train_directory, self.data_directory, sil_phones, - self.corpus.num_jobs, self, initial=False, iteration=i) - - acc_stats(i, self.train_directory, self.data_directory, self.corpus.num_jobs, self) - log_path = os.path.join(self.log_directory, 'update.{}.log'.format(i)) - with open(log_path, 'w') as logf: - acc_files = 
[os.path.join(self.train_directory, '{}.{}.acc'.format(i, x)) - for x in range(self.corpus.num_jobs)] - est_proc = subprocess.Popen([thirdparty_binary('gmm-est'), - '--write-occs=' + occs_path, - '--mix-up=' + str(num_gauss), '--power=' + str(self.power), - model_path, - "{} - {}|".format(thirdparty_binary('gmm-sum-accs'), - ' '.join(map(make_path_safe, acc_files))), - next_model_path], - stderr=logf) - est_proc.communicate() - if not os.path.exists(next_model_path): - raise(Exception('There was an error training in iteration {}, please check the logs.'.format(i))) - if not self.debug: - for f in acc_files: - os.remove(f) - self.parse_log_directory(self.log_directory, i, self.corpus.num_jobs, call_back) - if i < self.final_gaussian_iteration: - num_gauss += self.gaussian_increment - shutil.copy(os.path.join(self.train_directory, '{}.mdl'.format(self.num_iterations)), - os.path.join(self.train_directory, 'final.mdl')) - shutil.copy(os.path.join(self.train_directory, '{}.occs'.format(self.num_iterations)), - os.path.join(self.train_directory, 'final.occs')) - if not self.debug: - for i in range(1, self.num_iterations): + try: + for i in iters: model_path = os.path.join(self.train_directory, '{}.mdl'.format(i)) - try: - os.remove(model_path) - except FileNotFoundError: - pass - try: - os.remove(os.path.join(self.train_directory, '{}.occs'.format(i))) - except FileNotFoundError: + occs_path = os.path.join(self.train_directory, '{}.occs'.format(i + 1)) + next_model_path = os.path.join(self.train_directory, '{}.mdl'.format(i + 1)) + if os.path.exists(next_model_path): + continue + if i in self.realignment_iterations: + align(i, self.train_directory, self.data_directory, + self.dictionary.optional_silence_csl, + self.corpus.num_jobs, self) + if self.debug: + compute_alignment_improvement(i, self, self.train_directory, self.corpus.num_jobs) + if i in self.fmllr_iterations: + calc_fmllr(self.train_directory, self.data_directory, sil_phones, + self.corpus.num_jobs, self, 
initial=False, iteration=i) + + acc_stats(i, self.train_directory, self.data_directory, self.corpus.num_jobs, self) + log_path = os.path.join(self.log_directory, 'update.{}.log'.format(i)) + with open(log_path, 'w') as log_file: + acc_files = [os.path.join(self.train_directory, '{}.{}.acc'.format(i, x)) + for x in range(self.corpus.num_jobs)] + est_proc = subprocess.Popen([thirdparty_binary('gmm-est'), + '--write-occs=' + occs_path, + '--mix-up=' + str(num_gauss), '--power=' + str(self.power), + model_path, + "{} - {}|".format(thirdparty_binary('gmm-sum-accs'), + ' '.join(map(make_path_safe, acc_files))), + next_model_path], + stderr=log_file) + est_proc.communicate() + parse_logs(self.log_directory) + if not os.path.exists(next_model_path): + raise(Exception('There was an error training in iteration {}, please check the logs.'.format(i))) + if not self.debug: + for f in acc_files: + os.remove(f) + self.parse_log_directory(self.log_directory, i, self.corpus.num_jobs, call_back) + if i < self.final_gaussian_iteration: + num_gauss += self.gaussian_increment + shutil.copy(os.path.join(self.train_directory, '{}.mdl'.format(self.num_iterations)), + os.path.join(self.train_directory, 'final.mdl')) + shutil.copy(os.path.join(self.train_directory, '{}.occs'.format(self.num_iterations)), + os.path.join(self.train_directory, 'final.occs')) + if not self.debug: + for i in range(1, self.num_iterations): + model_path = os.path.join(self.train_directory, '{}.mdl'.format(i)) + try: + os.remove(model_path) + except FileNotFoundError: + pass + try: + os.remove(os.path.join(self.train_directory, '{}.occs'.format(i))) + except FileNotFoundError: + pass + except Exception as e: + with open(dirty_path, 'w'): + pass + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs, self.logger) + raise + with open(done_path, 'w'): + pass + self.logger.info('Training complete!') + self.logger.debug('Training took {} seconds'.format(time.time() - begin)) + + def align(self, 
subset, call_back=None): + dirty_path = os.path.join(self.align_directory, 'dirty') + if os.path.exists(dirty_path): # if there was an error, let's redo from scratch + shutil.rmtree(self.align_directory) + done_path = os.path.join(self.align_directory, 'done') + if not os.path.exists(done_path): + message = 'Generating alignments using {} models'.format(self.identifier) + if subset: + message += ' using {} utterances...'.format(subset) + else: + message += ' for the whole corpus...' + self.logger.info(message) + begin = time.time() + self.logger.debug('Using {} as the feature name'.format(self.feature_file_base_name)) + if subset is None: + align_data_directory = self.corpus.split_directory() + else: + align_data_directory = self.corpus.subset_directory(subset, self.feature_config) + try: + log_dir = os.path.join(self.align_directory, 'log') + os.makedirs(log_dir, exist_ok=True) + shutil.copy(os.path.join(self.train_directory, 'tree'), self.align_directory) + shutil.copyfile(os.path.join(self.train_directory, 'final.mdl'), + os.path.join(self.align_directory, 'final.mdl')) + + if os.path.exists(os.path.join(self.train_directory, 'lda.mat')): + shutil.copyfile(os.path.join(self.train_directory, 'lda.mat'), + os.path.join(self.align_directory, 'lda.mat')) + shutil.copyfile(os.path.join(self.train_directory, 'final.occs'), + os.path.join(self.align_directory, 'final.occs')) + compile_train_graphs(self.align_directory, self.dictionary.output_directory, + align_data_directory, self.corpus.num_jobs, self) + if align_data_directory == self.data_directory and os.path.exists(os.path.join(self.train_directory, 'trans.0')): + for i in range(self.corpus.num_jobs): + shutil.copy(os.path.join(self.train_directory, 'trans.{}'.format(i)), + os.path.join(self.align_directory, 'trans.{}'.format(i))) + align('final', self.align_directory, align_data_directory, + self.dictionary.optional_silence_csl, + self.corpus.num_jobs, self, self.align_directory) + + if not 
os.path.exists(os.path.join(self.align_directory, 'trans.0')): + calc_fmllr(self.align_directory, align_data_directory, + self.dictionary.optional_silence_csl, self.corpus.num_jobs, self, initial=True, iteration='final') + align('final', self.align_directory, align_data_directory, + self.dictionary.optional_silence_csl, + self.corpus.num_jobs, self, self.align_directory) + self.save(os.path.join(self.align_directory, 'acoustic_model.zip')) + except Exception as e: + with open(dirty_path, 'w'): pass + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs, self.logger) + raise + with open(done_path, 'w'): + pass + self.logger.debug('Alignment took {} seconds'.format(time.time() - begin)) + else: + self.logger.info('Alignments using {} models already done'.format(self.identifier)) + if self.debug: + self.export_textgrids() def init_training(self, identifier, temporary_directory, corpus, dictionary, previous_trainer): + self.feature_config.fmllr = False self._setup_for_init(identifier, temporary_directory, corpus, dictionary) - + done_path = os.path.join(self.train_directory, 'done') + dirty_path = os.path.join(self.train_directory, 'dirty') + if os.path.exists(done_path): + self.logger.info('{} training already done, skipping initialization.'.format(self.identifier)) + return + begin = time.time() if os.path.exists(os.path.join(self.train_directory, '1.mdl')): return self.feature_config.fmllr = True - print('Initializing speaker-adapted triphone training...') + self.logger.info('Initializing speaker-adapted triphone training...') align_directory = previous_trainer.align_directory context_opts = [] ci_phones = self.dictionary.silence_csl + try: + if os.path.exists(os.path.join(align_directory, 'lda.mat')): + shutil.copyfile(os.path.join(align_directory, 'lda.mat'), os.path.join(self.train_directory, 'lda.mat')) + tree_stats(self.train_directory, align_directory, + self.data_directory, ci_phones, self.corpus.num_jobs, self) + log_path = 
os.path.join(self.log_directory, 'questions.log') + tree_path = os.path.join(self.train_directory, 'tree') + treeacc_path = os.path.join(self.train_directory, 'treeacc') + sets_int_path = os.path.join(self.dictionary.phones_dir, 'sets.int') + roots_int_path = os.path.join(self.dictionary.phones_dir, 'roots.int') + extra_question_int_path = os.path.join(self.dictionary.phones_dir, 'extra_questions.int') + topo_path = os.path.join(self.dictionary.output_directory, 'topo') + questions_path = os.path.join(self.train_directory, 'questions.int') + questions_qst_path = os.path.join(self.train_directory, 'questions.qst') + with open(log_path, 'w') as log_file: + subprocess.call([thirdparty_binary('cluster-phones')] + context_opts + + [treeacc_path, sets_int_path, questions_path], stderr=log_file) + + with open(extra_question_int_path, 'r') as in_file, \ + open(questions_path, 'a') as out_file: + for line in in_file: + out_file.write(line) + + log_path = os.path.join(self.log_directory, 'compile_questions.log') + with open(log_path, 'w') as log_file: + subprocess.call([thirdparty_binary('compile-questions')] + context_opts + + [topo_path, questions_path, questions_qst_path], + stderr=log_file) + + log_path = os.path.join(self.log_directory, 'build_tree.log') + with open(log_path, 'w') as log_file: + subprocess.call([thirdparty_binary('build-tree')] + context_opts + + ['--verbose=1', '--max-leaves={}'.format(self.initial_gaussians), + '--cluster-thresh={}'.format(self.cluster_threshold), + treeacc_path, roots_int_path, questions_qst_path, + topo_path, tree_path], stderr=log_file) + + log_path = os.path.join(self.log_directory, 'init_model.log') + occs_path = os.path.join(self.train_directory, '0.occs') + mdl_path = os.path.join(self.train_directory, '0.mdl') + with open(log_path, 'w') as log_file: + subprocess.call([thirdparty_binary('gmm-init-model'), + '--write-occs=' + occs_path, tree_path, treeacc_path, + topo_path, mdl_path], stderr=log_file) + + log_path = 
os.path.join(self.log_directory, 'mixup.log') + with open(log_path, 'w') as log_file: + subprocess.call([thirdparty_binary('gmm-mixup'), + '--mix-up={}'.format(self.initial_gaussians), + mdl_path, occs_path, mdl_path], stderr=log_file) + os.remove(treeacc_path) + + compile_train_graphs(self.train_directory, self.dictionary.output_directory, + self.data_directory, self.corpus.num_jobs, self) + os.rename(occs_path, os.path.join(self.train_directory, '1.occs')) + os.rename(mdl_path, os.path.join(self.train_directory, '1.mdl')) + + convert_alignments(self.train_directory, align_directory, self.corpus.num_jobs, self) + + if os.path.exists(os.path.join(align_directory, 'trans.0')): + for i in range(self.corpus.num_jobs): + shutil.copy(os.path.join(align_directory, 'trans.{}'.format(i)), + os.path.join(self.train_directory, 'trans.{}'.format(i))) + else: - tree_stats(self.train_directory, align_directory, - self.data_directory, ci_phones, self.corpus.num_jobs, self) - log_path = os.path.join(self.log_directory, 'questions.log') - tree_path = os.path.join(self.train_directory, 'tree') - treeacc_path = os.path.join(self.train_directory, 'treeacc') - sets_int_path = os.path.join(self.dictionary.phones_dir, 'sets.int') - roots_int_path = os.path.join(self.dictionary.phones_dir, 'roots.int') - extra_question_int_path = os.path.join(self.dictionary.phones_dir, 'extra_questions.int') - topo_path = os.path.join(self.dictionary.output_directory, 'topo') - questions_path = os.path.join(self.train_directory, 'questions.int') - questions_qst_path = os.path.join(self.train_directory, 'questions.qst') - with open(log_path, 'w') as logf: - subprocess.call([thirdparty_binary('cluster-phones')] + context_opts + - [treeacc_path, sets_int_path, questions_path], stderr=logf) - - with open(extra_question_int_path, 'r') as inf, \ - open(questions_path, 'a') as outf: - for line in inf: - outf.write(line) - - log_path = os.path.join(self.log_directory, 'compile_questions.log') - with 
open(log_path, 'w') as logf: - subprocess.call([thirdparty_binary('compile-questions')] + context_opts + - [topo_path, questions_path, questions_qst_path], - stderr=logf) - - log_path = os.path.join(self.log_directory, 'build_tree.log') - with open(log_path, 'w') as logf: - subprocess.call([thirdparty_binary('build-tree')] + context_opts + - ['--verbose=1', '--max-leaves={}'.format(self.initial_gaussians), - '--cluster-thresh={}'.format(self.cluster_threshold), - treeacc_path, roots_int_path, questions_qst_path, - topo_path, tree_path], stderr=logf) - - log_path = os.path.join(self.log_directory, 'init_model.log') - occs_path = os.path.join(self.train_directory, '0.occs') - mdl_path = os.path.join(self.train_directory, '0.mdl') - with open(log_path, 'w') as logf: - subprocess.call([thirdparty_binary('gmm-init-model'), - '--write-occs=' + occs_path, tree_path, treeacc_path, - topo_path, mdl_path], stderr=logf) - - log_path = os.path.join(self.log_directory, 'mixup.log') - with open(log_path, 'w') as logf: - subprocess.call([thirdparty_binary('gmm-mixup'), - '--mix-up={}'.format(self.initial_gaussians), - mdl_path, occs_path, mdl_path], stderr=logf) - os.remove(treeacc_path) - - compile_train_graphs(self.train_directory, self.dictionary.output_directory, - self.data_directory, self.corpus.num_jobs, self) - os.rename(occs_path, os.path.join(self.train_directory, '1.occs')) - os.rename(mdl_path, os.path.join(self.train_directory, '1.mdl')) - - convert_alignments(self.train_directory, align_directory, self.corpus.num_jobs, self) - - calc_fmllr(self.train_directory, self.data_directory, - self.dictionary.silence_csl, self.corpus.num_jobs, self, initial=True) - - if os.path.exists(os.path.join(align_directory, 'trans.0')): - for i in range(self.corpus.num_jobs): - shutil.copy(os.path.join(align_directory, 'trans.{}'.format(i)), - os.path.join(self.train_directory, 'trans.{}'.format(i))) - print('Initialization complete!') + calc_fmllr(self.train_directory, 
self.data_directory, + self.dictionary.silence_csl, self.corpus.num_jobs, self, initial=True) + parse_logs(self.log_directory) + except Exception as e: + with open(dirty_path, 'w'): + pass + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs, self.logger) + raise + self.logger.info('Initialization complete!') + self.logger.debug('Initialization took {} seconds'.format(time.time() - begin)) diff --git a/montreal_forced_aligner/trainers/triphone.py b/montreal_forced_aligner/trainers/triphone.py index 7eea7409..4299c7eb 100644 --- a/montreal_forced_aligner/trainers/triphone.py +++ b/montreal_forced_aligner/trainers/triphone.py @@ -1,8 +1,9 @@ import os import subprocess - +import time from .base import BaseTrainer -from ..helper import thirdparty_binary +from ..helper import thirdparty_binary, log_kaldi_errors, parse_logs +from ..exceptions import KaldiProcessingError from ..multiprocessing import compile_train_graphs, tree_stats, convert_alignments @@ -49,69 +50,84 @@ def train_type(self): def phone_type(self): return 'triphone' + def _setup_tree(self, align_directory): + dirty_path = os.path.join(self.train_directory, 'dirty') + try: + context_opts = [] + ci_phones = self.dictionary.silence_csl + + tree_stats(self.train_directory, align_directory, + self.data_directory, ci_phones, self.corpus.num_jobs, self) + log_path = os.path.join(self.log_directory, 'questions.log') + tree_path = os.path.join(self.train_directory, 'tree') + treeacc_path = os.path.join(self.train_directory, 'treeacc') + sets_int_path = os.path.join(self.dictionary.phones_dir, 'sets.int') + roots_int_path = os.path.join(self.dictionary.phones_dir, 'roots.int') + extra_question_int_path = os.path.join(self.dictionary.phones_dir, 'extra_questions.int') + topo_path = os.path.join(self.dictionary.output_directory, 'topo') + questions_path = os.path.join(self.train_directory, 'questions.int') + questions_qst_path = os.path.join(self.train_directory, 'questions.qst') + with 
open(log_path, 'w') as log_file: + subprocess.call([thirdparty_binary('cluster-phones')] + context_opts + + [treeacc_path, sets_int_path, questions_path], stderr=log_file) + + with open(extra_question_int_path, 'r') as inf, \ + open(questions_path, 'a') as outf: + for line in inf: + outf.write(line) + + log_path = os.path.join(self.log_directory, 'compile_questions.log') + with open(log_path, 'w') as log_file: + subprocess.call([thirdparty_binary('compile-questions')] + context_opts + + [topo_path, questions_path, questions_qst_path], + stderr=log_file) + + log_path = os.path.join(self.log_directory, 'build_tree.log') + with open(log_path, 'w') as log_file: + subprocess.call([thirdparty_binary('build-tree')] + context_opts + + ['--verbose=1', '--max-leaves={}'.format(self.initial_gaussians), + '--cluster-thresh={}'.format(self.cluster_threshold), + treeacc_path, roots_int_path, questions_qst_path, + topo_path, tree_path], stderr=log_file) + + log_path = os.path.join(self.log_directory, 'init_model.log') + occs_path = os.path.join(self.train_directory, '0.occs') + mdl_path = os.path.join(self.train_directory, '0.mdl') + with open(log_path, 'w') as log_file: + subprocess.call([thirdparty_binary('gmm-init-model'), + '--write-occs=' + occs_path, tree_path, treeacc_path, + topo_path, mdl_path], stderr=log_file) + + log_path = os.path.join(self.log_directory, 'mixup.log') + with open(log_path, 'w') as log_file: + subprocess.call([thirdparty_binary('gmm-mixup'), + '--mix-up={}'.format(self.initial_gaussians), + mdl_path, occs_path, mdl_path], stderr=log_file) + #os.remove(treeacc_path) + parse_logs(self.log_directory) + + compile_train_graphs(self.train_directory, self.dictionary.output_directory, + self.data_directory, self.corpus.num_jobs, self) + os.rename(occs_path, os.path.join(self.train_directory, '1.occs')) + os.rename(mdl_path, os.path.join(self.train_directory, '1.mdl')) + + convert_alignments(self.train_directory, align_directory, self.corpus.num_jobs, self) + 
except Exception as e: + with open(dirty_path, 'w'): + pass + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs, self.logger) + raise + def init_training(self, identifier, temporary_directory, corpus, dictionary, previous_trainer): self._setup_for_init(identifier, temporary_directory, corpus, dictionary) - - if os.path.exists(os.path.join(self.train_directory, 'final.mdl')): + done_path = os.path.join(self.train_directory, 'done') + if os.path.exists(done_path): + self.logger.info('{} training already done, skipping initialization.'.format(self.identifier)) return - + begin = time.time() align_directory = previous_trainer.align_directory - context_opts = [] - ci_phones = self.dictionary.silence_csl - - tree_stats(self.train_directory, align_directory, - self.data_directory, ci_phones, self.corpus.num_jobs, self) - log_path = os.path.join(self.log_directory, 'questions.log') - tree_path = os.path.join(self.train_directory, 'tree') - treeacc_path = os.path.join(self.train_directory, 'treeacc') - sets_int_path = os.path.join(self.dictionary.phones_dir, 'sets.int') - roots_int_path = os.path.join(self.dictionary.phones_dir, 'roots.int') - extra_question_int_path = os.path.join(self.dictionary.phones_dir, 'extra_questions.int') - topo_path = os.path.join(self.dictionary.output_directory, 'topo') - questions_path = os.path.join(self.train_directory, 'questions.int') - questions_qst_path = os.path.join(self.train_directory, 'questions.qst') - with open(log_path, 'w') as log_file: - subprocess.call([thirdparty_binary('cluster-phones')] + context_opts + - [treeacc_path, sets_int_path, questions_path], stderr=log_file) - - with open(extra_question_int_path, 'r') as inf, \ - open(questions_path, 'a') as outf: - for line in inf: - outf.write(line) - - log_path = os.path.join(self.log_directory, 'compile_questions.log') - with open(log_path, 'w') as log_file: - subprocess.call([thirdparty_binary('compile-questions')] + context_opts + - [topo_path, 
questions_path, questions_qst_path], - stderr=log_file) - - log_path = os.path.join(self.log_directory, 'build_tree.log') - with open(log_path, 'w') as log_file: - subprocess.call([thirdparty_binary('build-tree')] + context_opts + - ['--verbose=1', '--max-leaves={}'.format(self.initial_gaussians), - '--cluster-thresh={}'.format(self.cluster_threshold), - treeacc_path, roots_int_path, questions_qst_path, - topo_path, tree_path], stderr=log_file) - - log_path = os.path.join(self.log_directory, 'init_model.log') - occs_path = os.path.join(self.train_directory, '0.occs') - mdl_path = os.path.join(self.train_directory, '0.mdl') - with open(log_path, 'w') as log_file: - subprocess.call([thirdparty_binary('gmm-init-model'), - '--write-occs=' + occs_path, tree_path, treeacc_path, - topo_path, mdl_path], stderr=log_file) - - log_path = os.path.join(self.log_directory, 'mixup.log') - with open(log_path, 'w') as log_file: - subprocess.call([thirdparty_binary('gmm-mixup'), - '--mix-up={}'.format(self.initial_gaussians), - mdl_path, occs_path, mdl_path], stderr=log_file) - os.remove(treeacc_path) - - compile_train_graphs(self.train_directory, self.dictionary.output_directory, - self.data_directory, self.corpus.num_jobs, self) - os.rename(occs_path, os.path.join(self.train_directory, '1.occs')) - os.rename(mdl_path, os.path.join(self.train_directory, '1.mdl')) - - convert_alignments(self.train_directory, align_directory, self.corpus.num_jobs, self) - print('Initialization complete!') + self._setup_tree(align_directory) + + self.logger.info('Initialization complete!') + self.logger.debug('Initialization took {} seconds'.format(time.time() - begin)) diff --git a/montreal_forced_aligner/transcriber.py b/montreal_forced_aligner/transcriber.py index 58e9d116..881e134a 100644 --- a/montreal_forced_aligner/transcriber.py +++ b/montreal_forced_aligner/transcriber.py @@ -79,15 +79,17 @@ def setup(self): ha_path = os.path.join(self.transcribe_directory, 'Ha.fst') hclga_path = 
os.path.join(self.transcribe_directory, 'HCLGa.fst') hclg_path = os.path.join(self.transcribe_directory, 'HCLG.fst') + words_path = os.path.join(self.transcribe_directory, 'words.txt') shutil.copyfile(self.dictionary.words_symbol_path, os.path.join(self.transcribe_directory, 'words.txt')) if os.path.exists(hclg_path): return print('Generating decoding graph...') + with open(log_path, 'w') as log_file: if not os.path.exists(g_path): print('Generating G.fst...') arpafst_proc = subprocess.Popen([thirdparty_binary('arpa2fst'), '--disambig-symbol=#0', - '--read-symbol-table=' + self.dictionary.words_symbol_path, + '--read-symbol-table=' + words_path, self.language_model.arpa_path, g_path], stderr=log_file, stdout=log_file) arpafst_proc.communicate() print('Done!') diff --git a/montreal_forced_aligner/validator.py b/montreal_forced_aligner/validator.py index 1f0355af..52f8ab24 100644 --- a/montreal_forced_aligner/validator.py +++ b/montreal_forced_aligner/validator.py @@ -2,7 +2,8 @@ from decimal import Decimal import subprocess -from .helper import thirdparty_binary, load_scp, edit_distance +from .helper import thirdparty_binary, load_scp, edit_distance, log_kaldi_errors +from .exceptions import KaldiProcessingError from .multiprocessing import run_mp, run_non_mp from .trainers import MonophoneTrainer @@ -26,6 +27,9 @@ def test_utterances_func(validator, job_name): log_path = os.path.join(aligner.align_directory, 'log', 'decode.0.{}.log'.format(job_name)) words_path = os.path.join(validator.dictionary.output_directory, 'words.txt') mdl_path = os.path.join(aligner.align_directory, 'final.mdl') + + split_directory = validator.corpus.split_directory() + feat_string = aligner.feature_config.construct_feature_proc_string(split_directory, aligner.align_directory, job_name) feat_path = os.path.join(validator.corpus.split_directory(), aligner.feature_file_base_name + '.{}.scp'.format(job_name)) graphs_path = os.path.join(aligner.align_directory, 
'utterance_graphs.{}.fst'.format(job_name)) @@ -44,7 +48,7 @@ def test_utterances_func(validator, job_name): '--beam={}'.format(beam), '--max-active={}'.format(max_active), '--lattice-beam={}'.format(lattice_beam), '--word-symbol-table=' + words_path, - mdl_path, 'ark:' + graphs_path, 'scp:' + feat_path, 'ark:' + lat_path], + mdl_path, 'ark:' + graphs_path, feat_string, 'ark:' + lat_path], stderr=logf) latgen_proc.communicate() @@ -83,7 +87,7 @@ def compile_utterance_train_graphs_func(validator, job_name): # pragma: no cove '--read-disambig-syms={}'.format(disambig_int_path), tree_path, mdl_path, lexicon_fst_path, - "ark:"+fsts_path, "ark:" + graphs_path], + "ark:" + fsts_path, "ark:" + graphs_path], stderr=logf) proc.communicate() @@ -162,17 +166,20 @@ class CorpusValidator(object): ''' def __init__(self, corpus, dictionary, temp_directory=None, ignore_acoustics=False, test_transcriptions=False, - use_mp=True): + use_mp=True, logger=None): self.dictionary = dictionary self.corpus = corpus self.temp_directory = temp_directory self.test_transcriptions = test_transcriptions self.ignore_acoustics = ignore_acoustics self.trainer = MonophoneTrainer(FeatureConfig()) + self.logger = logger + self.trainer.logger = logger self.trainer.update({"use_mp": use_mp}) self.setup() def setup(self): + self.dictionary.set_word_set(self.corpus.word_set) self.dictionary.write() if self.test_transcriptions: self.dictionary.write(disambig=True) @@ -190,22 +197,22 @@ def analyze_setup(self): ignored_count = len(self.corpus.no_transcription_files) ignored_count += len(self.corpus.textgrid_read_errors) ignored_count += len(self.corpus.decode_error_files) - print(self.corpus_analysis_template.format(len(self.corpus.wav_files), - self.corpus.lab_count, - self.corpus.tg_count, - ignored_count, - len(self.corpus.speak_utt_mapping), - self.corpus.num_utterances, - total_duration, - self.analyze_oovs(), - self.analyze_wav_errors(), - self.analyze_missing_features(), - 
self.analyze_files_with_no_transcription(), - self.analyze_transcriptions_with_no_wavs(), - self.analyze_textgrid_read_errors(), - self.analyze_unreadable_text_files(), - self.analyze_unsupported_sample_rates() - )) + self.logger.info(self.corpus_analysis_template.format(len(self.corpus.wav_files), + self.corpus.lab_count, + self.corpus.tg_count, + ignored_count, + len(self.corpus.speak_utt_mapping), + self.corpus.num_utterances, + total_duration, + self.analyze_oovs(), + self.analyze_wav_errors(), + self.analyze_missing_features(), + self.analyze_files_with_no_transcription(), + self.analyze_transcriptions_with_no_wavs(), + self.analyze_textgrid_read_errors(), + self.analyze_unreadable_text_files(), + self.analyze_unsupported_sample_rates() + )) def analyze_oovs(self): output_dir = self.corpus.output_directory @@ -391,55 +398,63 @@ def validate(self): self.test_utterance_transcriptions() def test_utterance_transcriptions(self): - print('Checking utterance transcriptions...') + self.logger.info('Checking utterance transcriptions...') split_directory = self.corpus.split_directory() model_directory = self.trainer.align_directory - - jobs = [(self, x) - for x in range(self.corpus.num_jobs)] - if self.trainer.feature_config.use_mp: - run_mp(compile_utterance_train_graphs_func, jobs) - else: - run_non_mp(compile_utterance_train_graphs_func, jobs) - print('Utterance FSTs compiled!') - print('Decoding utterances (this will take some time)...') - if self.trainer.feature_config.use_mp: - run_mp(test_utterances_func, jobs) - else: - run_non_mp(test_utterances_func, jobs) - print('Finished decoding utterances!') - - word_mapping = self.dictionary.reversed_word_mapping - errors = {} - - for job in range(self.corpus.num_jobs): - text_path = os.path.join(split_directory, 'text.{}'.format(job)) - texts = load_scp(text_path) - aligned_int = load_scp(os.path.join(model_directory, 'aligned.{}.int'.format(job))) - with open(os.path.join(model_directory, 'aligned.{}'.format(job)), 
'w') as outf: - for utt, line in sorted(aligned_int.items()): - text = [] - for t in line: - text.append(word_mapping[int(t)]) - outf.write('{} {}\n'.format(utt, ' '.join(text))) - ref_text = texts[utt] - edits = edit_distance(text, ref_text) - - if edits: - errors[utt] = (edits, ref_text, text) - if not errors: - message = 'There were no utterances with transcription issues.' - else: - out_path = os.path.join(self.corpus.output_directory, 'transcription_problems.csv') - with open(out_path, 'w') as problemf: - problemf.write('Utterance,Edits,Reference,Decoded\n') - for utt, (edits, ref_text, text) in sorted(errors.items(), - key=lambda x: -1 * ( - len(x[1][1]) + len(x[1][2]))): - problemf.write('{},{},{},{}\n'.format(utt, edits, - ' '.join(ref_text), ' '.join(text))) - message = 'There were {} of {} utterances with at least one transcription issue. ' \ - 'Please see the outputted csv file {}.'.format(len(errors), self.corpus.num_utterances, out_path) - - print(self.transcription_analysis_template.format(message)) + log_directory = os.path.join(model_directory, 'log') + + try: + + jobs = [(self, x) + for x in range(self.corpus.num_jobs)] + if self.trainer.feature_config.use_mp: + run_mp(compile_utterance_train_graphs_func, jobs, log_directory) + else: + run_non_mp(compile_utterance_train_graphs_func, jobs, log_directory) + self.logger.info('Utterance FSTs compiled!') + self.logger.info('Decoding utterances (this will take some time)...') + if self.trainer.feature_config.use_mp: + run_mp(test_utterances_func, jobs, log_directory) + else: + run_non_mp(test_utterances_func, jobs, log_directory) + self.logger.info('Finished decoding utterances!') + + word_mapping = self.dictionary.reversed_word_mapping + errors = {} + + for job in range(self.corpus.num_jobs): + text_path = os.path.join(split_directory, 'text.{}'.format(job)) + texts = load_scp(text_path) + aligned_int = load_scp(os.path.join(model_directory, 'aligned.{}.int'.format(job))) + with 
open(os.path.join(model_directory, 'aligned.{}'.format(job)), 'w') as outf: + for utt, line in sorted(aligned_int.items()): + text = [] + for t in line: + text.append(word_mapping[int(t)]) + outf.write('{} {}\n'.format(utt, ' '.join(text))) + ref_text = texts[utt] + edits = edit_distance(text, ref_text) + + if edits: + errors[utt] = (edits, ref_text, text) + if not errors: + message = 'There were no utterances with transcription issues.' + else: + out_path = os.path.join(self.corpus.output_directory, 'transcription_problems.csv') + with open(out_path, 'w') as problemf: + problemf.write('Utterance,Edits,Reference,Decoded\n') + for utt, (edits, ref_text, text) in sorted(errors.items(), + key=lambda x: -1 * ( + len(x[1][1]) + len(x[1][2]))): + problemf.write('{},{},{},{}\n'.format(utt, edits, + ' '.join(ref_text), ' '.join(text))) + message = 'There were {} of {} utterances with at least one transcription issue. ' \ + 'Please see the outputted csv file {}.'.format(len(errors), self.corpus.num_utterances, out_path) + + self.logger.info(self.transcription_analysis_template.format(message)) + + except Exception as e: + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs, self.logger) + raise diff --git a/requirements.txt b/requirements.txt index f45f516d..e91f6f0b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,6 @@ pyyaml librosa pyqt5 pyqtgraph -requests \ No newline at end of file +requests +sklearn +joblib \ No newline at end of file diff --git a/setup.py b/setup.py index a681fa08..399a96e2 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ def run_tests(self): if __name__ == '__main__': setup(name='Montreal Forced Aligner', - version='2.0.0a2', + version='2.0.0a3', description='Montreal Forced Aligner is a package for aligning speech corpora through the use of ' 'acoustic models and dictionaries using Kaldi functionality.', long_description=readme(), @@ -68,6 +68,7 @@ def run_tests(self): 'montreal_forced_aligner.g2p', 
'montreal_forced_aligner.gui', 'montreal_forced_aligner.lm', + 'montreal_forced_aligner.multiprocessing', 'montreal_forced_aligner.thirdparty', 'montreal_forced_aligner.trainers'], install_requires=[ @@ -79,6 +80,8 @@ def run_tests(self): 'pyqt5', 'pyqtgraph', 'requests', + 'sklearn', + 'joblib' ], python_requires='>=3.8', entry_points={ diff --git a/tests/conftest.py b/tests/conftest.py index 7402a471..4292354c 100755 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,7 +6,7 @@ import shutil import pytest -from montreal_forced_aligner.corpus import AlignableCorpus +from montreal_forced_aligner.corpus import AlignableCorpus, TranscribeCorpus from montreal_forced_aligner.dictionary import Dictionary from montreal_forced_aligner.config import train_yaml_to_config, align_yaml_to_config @@ -69,14 +69,20 @@ def english_acoustic_model(): download_model('acoustic', 'english') +@pytest.fixture(scope='session') +def english_ivector_model(): + from montreal_forced_aligner.command_line.download import download_model + download_model('ivector', 'english_ivector') + + @pytest.fixture(scope='session') def transcription_acoustic_model(acoustic_model_dir): return os.path.join(acoustic_model_dir, 'mono_model.zip') @pytest.fixture(scope='session') -def transcription_language_model(language_model_dir): - return os.path.join(language_model_dir, 'basic_lm.arpa') +def transcription_language_model(language_model_dir, generated_dir): + return os.path.join(language_model_dir, 'basic_lm.zip') @pytest.fixture(scope='session') @@ -319,6 +325,13 @@ def sick_corpus(basic_corpus_dir, generated_dir): return corpus +@pytest.fixture(scope='session') +def sick_corpus_transcribe(basic_corpus_dir, generated_dir): + output_directory = os.path.join(generated_dir, 'sickcorpus_transcribe') + corpus = TranscribeCorpus(basic_corpus_dir, output_directory, num_jobs=2) + return corpus + + @pytest.fixture(scope='session') def textgrid_directory(test_dir): return os.path.join(test_dir, 'textgrid') @@ -439,6 
+452,11 @@ def basic_train_config(config_directory): return os.path.join(config_directory, 'basic_train_config.yaml') +@pytest.fixture(scope='session') +def transcribe_config(config_directory): + return os.path.join(config_directory, 'transcribe.yaml') + + @pytest.fixture(scope='session') def basic_train_lm_config(config_directory): return os.path.join(config_directory, 'basic_train_lm.yaml') @@ -450,8 +468,13 @@ def basic_align_config(config_directory): @pytest.fixture(scope='session') -def basic_train_ivector_config(config_directory): - return os.path.join(config_directory, 'basic_train_ivector.yaml') +def basic_segment_config(config_directory): + return os.path.join(config_directory, 'basic_segment_config.yaml') + + +@pytest.fixture(scope='session') +def train_ivector_config(config_directory): + return os.path.join(config_directory, 'ivector_train.yaml') @pytest.fixture(scope='session') diff --git a/tests/data/configs/basic_segment_config.yaml b/tests/data/configs/basic_segment_config.yaml new file mode 100644 index 00000000..f1855928 --- /dev/null +++ b/tests/data/configs/basic_segment_config.yaml @@ -0,0 +1,6 @@ +use_mp: false + +energy_threshold: 9 +energy_mean_scale: 0.5 +max_segment_length: 30 +min_pause_duration: 0.25 \ No newline at end of file diff --git a/tests/data/configs/basic_train_ivector.yaml b/tests/data/configs/basic_train_ivector.yaml deleted file mode 100644 index 85fee04b..00000000 --- a/tests/data/configs/basic_train_ivector.yaml +++ /dev/null @@ -1,44 +0,0 @@ -beam: 10 -retry_beam: 40 -use_mp: false - -features: - type: "mfcc" - use_energy: false - frame_shift: 10 - pitch: false - -training: - - monophone: - num_iterations: 5 - max_gaussians: 1000 - subset: 100 - - - triphone: - num_iterations: 3 - num_leaves: 250 - max_gaussians: 2000 - cluster_threshold: -1 - subset: 1000 - - - lda: - num_iterations: 2 - num_leaves: 500 - max_gaussians: 4000 - subset: 1000 - features: - splice_left_context: 3 - splice_right_context: 3 - - - ivector: - 
num_iterations: 2 - gaussian_min_count: 2 - ubm_num_iterations_init: 4 - ubm_num_threads: 4 - silence_weight: 0.0 - posterior_scale: 0.1 - max_count: 100 - features: - lda: true - splice_left_context: 3 - splice_right_context: 3 \ No newline at end of file diff --git a/tests/data/configs/ivector_train.yaml b/tests/data/configs/ivector_train.yaml index 0bccd397..b4747f2a 100644 --- a/tests/data/configs/ivector_train.yaml +++ b/tests/data/configs/ivector_train.yaml @@ -1,20 +1,12 @@ -beam: 10 -retry_beam: 400 use_mp: false features: type: "mfcc" - use_energy: false + use_energy: true frame_shift: 10 training: - - monophone: - num_iterations: 4 - max_gaussians: 500 - subset: 1000 - - ivector: num_iterations: 2 gaussian_min_count: 2 ubm_num_iterations_init: 4 - ubm_num_threads: 4 diff --git a/tests/data/configs/mono_train.yaml b/tests/data/configs/mono_train.yaml index f475a290..8c3bdc2f 100644 --- a/tests/data/configs/mono_train.yaml +++ b/tests/data/configs/mono_train.yaml @@ -4,7 +4,7 @@ use_mp: false features: type: "mfcc" - use_energy: false + use_energy: true frame_shift: 10 pitch: false diff --git a/tests/data/configs/transcribe.yaml b/tests/data/configs/transcribe.yaml new file mode 100644 index 00000000..cc4c8e3c --- /dev/null +++ b/tests/data/configs/transcribe.yaml @@ -0,0 +1,3 @@ +use_mp: false + + diff --git a/tests/data/lm/basic_lm.arpa b/tests/data/lm/basic_lm.arpa deleted file mode 100644 index 88df38bf..00000000 --- a/tests/data/lm/basic_lm.arpa +++ /dev/null @@ -1,414 +0,0 @@ - -\data\ -ngram 1=84 -ngram 2=148 -ngram 3=169 - -\1-grams: --99 -0.102305 --2.170262 --1.568202 this -0.500245 --1.869232 is -0.102305 --1.471292 the -0.2484331 --1.869232 acoustic -0.403335 --1.869232 corpus -0.2272438 --1.693141 i'm -0.1023051 --1.869232 talking -0.1023051 --2.170262 pretty -0.102305 --2.170262 fast -0.102305 --1.869232 here -0.102305 --1.869232 there's -0.1023051 --2.170262 nothing -0.102305 --1.869232 going -0.1023051 --2.170262 else -0.102305 --2.170262 on 
-0.102305 --1.869232 we're -0.403335 --1.869232 just -0.1023051 --2.170262 yknow -0.102305 --1.693141 some -0.102305 --2.170262 speech -0.102305 --2.170262 errors -0.102305 --2.170262 but -0.102305 --2.170262 who -0.102305 --2.170262 cares -0.102305 --1.568202 um -0.199215 --2.170262 me -0.102305 --2.170262 really -0.102305 --2.170262 slow -0.102305 --1.392111 and -0.2272438 --2.170262 slightly -0.102305 --2.170262 lower -0.102305 --1.869232 in -0.1023051 --2.170262 intensity -0.102305 --1.693141 saying -0.1023051 --1.471292 words -0.1814863 --2.170262 here's -0.1023051 --2.170262 more -0.102305 --2.170262 word -0.102305 --1.568202 that -0.2272438 --2.170262 should -0.102305 --2.170262 be -0.102305 --2.170262 all -0.102305 --1.869232 thanks -0.403335 --1.325164 uh -0.2114495 --1.568202 so -0.3241538 --2.170262 sick -0.102305 --1.471292 i -0.2783963 --2.170262 have -0.102305 --1.568202 a -0.102305 --2.170262 cold -0.102305 --1.568202 probably -0.102305 --2.170262 sound -0.102305 --1.869232 quite -0.403335 --1.693141 different -0.2272438 --2.170262 than -0.102305 --2.170262 recording -0.1023051 --2.170262 environment -0.102305 --1.869232 also -0.102305 --1.869232 bunch -0.403335 --2.170262 of -0.102305 --2.170262 did -0.102305 --2.170262 not -0.102305 --2.170262 say -0.102305 --2.170262 original -0.102305 --2.170262 one -0.1023051 --2.170262 long -0.102305 --2.170262 pause -0.102305 --2.170262 think -0.1023051 --1.869232 good -0.403335 --1.869232 alright -0.102305 --2.170262 hopefully -0.102305 --2.170262 levels -0.102305 --2.170262 are -0.102305 --2.170262 okay -0.102305 --2.170262 sounds -0.102305 --2.170262 lot -0.102305 --2.170262 [adif] -0.102305 --2.170262 gonna -0.102305 --2.170262 cough -0.102305 --2.170262 -0.102305 --2.170262 yeah -0.102305 --2.170262 happened -0.1023051 - -\2-grams: --1.039465 this -0.07255066 --0.9692807 uh -0.07255067 --1.09347 alright -0.07255066 --0.189602 this is -0.2943994 --1.296561 this probably -0.07255066 --1.101481 is the 
-0.2486419 --1.238014 is me -0.07255067 --1.131803 is probably -0.07255066 --1.199651 is also -0.07255067 --0.7436042 the acoustic -0.3735807 --1.246603 the uh -0.07255066 --1.471139 the sick -0.3735807 --0.7528794 the recording -0.07255066 --1.471139 the original -0.07255066 --0.214473 acoustic corpus -0.07255069 --1.190567 corpus i'm -0.07255066 --1.164404 corpus um -0.07255067 --0.4808323 corpus uh -0.07255064 --1.199651 i'm talking -0.07255067 --1.164404 i'm saying -0.07255067 --1.131803 i'm probably -0.07255066 --1.199651 i'm also -0.07255067 --0.9575152 talking pretty -0.07255066 --0.9575152 talking really -0.07255066 --0.667127 pretty fast -0.07255067 --0.6564853 fast here -0.07255067 --0.9369835 here there's -0.07255067 --0.9575152 here -0.07255066 --0.9575152 there's nothing -0.07255067 --0.9173787 there's some -0.07255067 --0.6564853 nothing going -0.07255067 --0.9575152 going else -0.07255067 --0.9575152 going on -0.07255067 --0.6564853 else going -0.07255067 --0.6564853 on we're -0.07255066 --0.214473 we're just -0.07255067 --1.123219 just yknow -0.07255067 --1.065629 just saying -0.07255066 --1.123219 just happened -0.07255067 --0.6564853 yknow there's -0.07255067 --1.123219 some speech -0.07255066 --1.014788 some words -0.07255066 --1.123219 some more -0.07255066 --0.667127 speech errors -0.07255066 --0.667127 errors but -0.07255066 --0.667127 but who -0.07255066 --0.667127 who cares -0.07255067 --0.6359535 cares um -0.07255067 --1.228714 um this -0.07255066 --1.198391 um the -0.07255066 --1.170047 um and -0.07255067 --0.5794997 um i -0.07255066 --0.6564853 me talking -0.07255067 --0.667127 really slow -0.07255066 --0.6163487 slow and -0.07255066 --0.7871559 and i'm -0.07255067 --1.519439 and slightly -0.07255066 --0.8090003 and here's -0.07255066 --1.374167 and that -0.07255066 --1.265499 and uh -0.07255067 --1.334845 and i -0.07255067 --0.667127 slightly lower -0.07255067 --0.6564853 lower in -0.07255067 --0.8806397 in the -0.07255066 --0.9575152 in 
intensity -0.07255067 --0.6564853 intensity we're -0.07255066 --1.065629 saying some -0.07255067 --1.039465 saying a -0.07255066 --1.123219 saying [adif] -0.07255067 --0.6586809 words um -0.07255068 --1.209906 words and -0.07255067 --1.242423 words words -0.07255067 --1.404192 words word -0.07255066 --1.277572 words that -0.07255067 --0.9173787 here's some -0.07255067 --0.8986208 here's a -0.07255066 --0.6260405 more words -0.07255067 --0.6260405 word words -0.07255067 --1.218409 that just -0.07255067 --0.5136076 that should -0.3735807 --1.139727 that i -0.07255066 --0.667127 should be -0.07255067 --0.9575152 be all -0.07255067 --0.9369835 be good -0.07255066 --0.6564853 all thanks -0.07255066 --0.2163768 thanks --1.49998 uh acoustic -0.07255066 --0.567904 uh and -0.07255066 --1.398714 uh that -0.07255066 --1.280787 uh uh -0.07255067 --1.398714 uh so -0.07255067 --1.355747 uh i -0.07255067 --1.561111 uh hopefully -0.07255066 --0.3421909 so this -0.2486419 --1.261314 so that -0.07255066 --1.236637 so i -0.07255067 --0.6564853 sick corpus -0.3735807 --0.6878116 i have -0.3735807 --1.307895 i probably -0.07255066 --1.414105 i did -0.07255066 --0.6878116 i think -0.07255066 --0.6359535 have a -0.3735807 --1.238014 a cold -0.3735807 --1.199651 a bunch -0.07255066 --1.238014 a long -0.07255066 --1.238014 a lot -0.07255067 --0.6359535 cold so -0.07255066 --1.238014 probably sound -0.07255067 --1.199651 probably good -0.07255066 --1.238014 probably sounds -0.07255067 --1.238014 probably gonna -0.07255066 --0.6564853 sound quite -0.07255068 --0.2125776 quite different -0.07255066 --1.116377 different and -0.07255067 --1.139727 different words -0.07255068 --0.5136076 different than -0.3735807 --0.6260405 than the -0.07255066 --0.9575152 recording environment -0.07255067 --0.9575152 recording levels -0.07255066 --0.6564853 environment is -0.07255066 --0.9173787 also saying -0.07255067 --0.9369835 also quite -0.07255068 --0.2163768 bunch of -0.3735807 --0.6460981 of different 
-0.3735807 --0.667127 did not -0.07255066 --0.667127 not say -0.07255067 --0.6564853 say in -0.07255066 --0.667127 original one -0.07255067 --0.6068685 one uh -0.07255067 --0.667127 long pause -0.07255066 --0.6163487 pause and -0.07255067 --0.8986208 think this -0.07255066 --0.9173787 think i'm -0.07255066 --0.214473 good alright -0.3735807 --0.9369835 alright thanks -0.3735807 --0.8986208 alright so -0.07255067 --0.6260405 hopefully the -0.07255066 --0.667127 levels are -0.07255066 --0.667127 are okay -0.07255067 --0.6359535 okay um -0.07255066 --0.6359535 sounds a -0.07255066 --0.6460981 lot different -0.07255066 --0.6564853 [adif] bunch -0.07255066 --0.667127 gonna cough -0.07255067 --0.6564853 cough here -0.07255067 --0.667127 yeah -0.07255067 --0.6359535 yeah so -0.07255066 --0.6068685 happened uh -0.07255067 - -\3-grams: --0.1544866 this is --0.7266929 uh so --0.5838433 alright so --0.3270165 this is the --1.220997 this is me --1.165904 this is probably --0.6930168 this probably sounds --0.8150716 is the acoustic --0.3939632 is the sick --0.4679228 is me talking --0.6834472 is probably good --0.5991604 is also quite --0.07825259 the acoustic corpus --0.7029566 the uh uh --0.1737733 the sick corpus --0.7689528 the recording environment --0.7689528 the recording levels --0.4737236 the original one --0.9753377 acoustic corpus i'm --0.961654 acoustic corpus um --0.4802593 acoustic corpus uh --0.6834472 corpus i'm talking --0.683122 corpus um the --0.5526418 corpus uh and --1.052657 corpus uh i --1.127683 corpus uh hopefully --0.6070263 i'm talking pretty --0.6361785 i'm saying a --0.6930168 i'm probably gonna --0.5914343 i'm also saying --0.4737236 talking pretty fast --0.4737236 talking really slow --0.4679228 pretty fast here --0.5991604 fast here there's --0.6070263 here there's nothing --0.4737236 here yeah --0.4679228 there's nothing going --0.662422 there's some speech --0.6070263 nothing going else --0.4679228 going else going --0.4679228 going on we're 
--0.6070263 else going on --0.1737733 on we're just --0.8519024 we're just yknow --0.8248622 we're just saying --0.4679228 just yknow there's --0.6447514 just saying some --0.4400271 just happened uh --0.5914343 yknow there's some --0.4737236 some speech errors --0.6860678 some words and --0.4509714 some more words --0.4737236 speech errors but --0.4737236 errors but who --0.4737236 but who cares --0.4565486 who cares um --0.6907549 cares um this --0.1544866 um this is --0.5180966 um the recording --0.7221693 um and that --0.6010913 um i have --0.6010913 um i think --0.6070263 me talking really --0.4454648 really slow and --0.7460986 slow and slightly --0.870078 and i'm saying --0.8848732 and i'm also --0.4737236 and slightly lower --0.7464948 and here's some --0.7356867 and here's a --0.3838717 and that should --0.7266929 and uh that --0.4848134 and i think --0.4679228 slightly lower in --0.6070263 lower in intensity --0.738874 in the original --0.4679228 in intensity we're --0.1737733 intensity we're just --0.6277716 saying some words --0.6834472 saying a bunch --0.4679228 saying [adif] bunch --0.8724939 words um and --0.5232542 words um i --0.5448476 words and here's --0.7276742 words words word --0.4509714 words word words --0.6671918 words that i --0.662422 here's some more --0.6930168 here's a long --0.6940763 more words words --0.469125 word words um --0.662422 that just happened --0.1752394 that should be --0.7294244 that i did --0.7689528 should be all --0.7575786 should be good --0.4679228 be all thanks --0.1737733 be good alright --0.1752394 all thanks --0.1737733 uh acoustic corpus --0.7225884 uh and i'm --0.7384094 uh and here's --1.012349 uh and uh --0.3838717 uh that should --0.74327 uh uh acoustic --0.2686797 uh so this --0.4848134 uh i have --0.4509714 uh hopefully the --0.1254228 so this is --1.098112 so this probably --0.6882056 so that just --0.7088747 so i probably --0.144633 sick corpus uh --0.170856 i have a --0.6930168 i probably sound 
--0.4737236 i did not --0.7356867 i think this --0.7464948 i think i'm --0.2208508 have a cold --0.170856 a cold so --0.1752394 a bunch of --0.4737236 a long pause --0.4621985 a lot different --0.3355989 cold so this --0.8996548 cold so i --0.4679228 probably sound quite --0.1737733 probably good alright --0.4565486 probably sounds a --0.4737236 probably gonna cough --0.1723122 sound quite different --0.8487894 quite different and --0.4733422 quite different than --0.53465 different and i'm --0.5806979 different words um --0.915142 different words that --0.1694047 different than the --0.6389804 than the acoustic --0.903509 than the uh --0.4679228 recording environment is --0.4737236 recording levels are --0.6834472 environment is also --0.662422 also saying [adif] --0.1723122 also quite different --0.1723122 bunch of different --0.2163883 of different words --0.4737236 did not say --0.4679228 not say in --0.5763827 say in the --0.4400271 original one uh --0.4171492 one uh and --0.4454648 long pause and --0.7144772 pause and i --0.1544866 think this is --0.6649185 think i'm probably --0.2035385 good alright thanks --0.07884029 alright thanks --0.2686797 alright so this --0.5180966 hopefully the recording --0.4737236 levels are okay --0.4565486 are okay um --0.4240461 okay um i --0.6930168 sounds a lot --0.3838717 lot different than --0.1752394 [adif] bunch of --0.4679228 gonna cough here --0.6070263 cough here --0.4565486 yeah so --0.6985244 yeah so that --0.4171492 happened uh and - -\end\ diff --git a/tests/data/lm/basic_lm.zip b/tests/data/lm/basic_lm.zip new file mode 100644 index 00000000..44e14c09 Binary files /dev/null and b/tests/data/lm/basic_lm.zip differ diff --git a/tests/test_aligner.py b/tests/test_aligner.py index 48c3f976..232358d6 100644 --- a/tests/test_aligner.py +++ b/tests/test_aligner.py @@ -4,23 +4,11 @@ from montreal_forced_aligner.aligner import TrainableAligner, PretrainedAligner from montreal_forced_aligner.models import AcousticModel 
-from montreal_forced_aligner.config import load_basic_align #@pytest.mark.skip(reason='Optimization') -def test_sick_ivector(sick_dict, sick_corpus, generated_dir, ivector_train_config): - shutil.rmtree(sick_corpus.output_directory, ignore_errors=True) - os.makedirs(sick_corpus.output_directory, exist_ok=True) - ivector_train_config, align_config = ivector_train_config - data_directory = os.path.join(generated_dir, 'temp', 'ivector_test') - shutil.rmtree(data_directory, ignore_errors=True) - a = TrainableAligner(sick_corpus, sick_dict, ivector_train_config, align_config, - temp_directory=data_directory) - a.train() - - -#@pytest.mark.skip(reason='Optimization') -def test_sick_mono(sick_dict, sick_corpus, generated_dir, mono_train_config, mono_align_model_path, mono_align_config, mono_output_directory): +def test_sick_mono(sick_dict, sick_corpus, generated_dir, mono_train_config, mono_align_model_path, + mono_align_config, mono_output_directory): shutil.rmtree(sick_corpus.output_directory, ignore_errors=True) os.makedirs(sick_corpus.output_directory, exist_ok=True) mono_train_config, align_config = mono_train_config diff --git a/tests/test_commandline_align.py b/tests/test_commandline_align.py index 57d9d2f5..f5163715 100644 --- a/tests/test_commandline_align.py +++ b/tests/test_commandline_align.py @@ -1,7 +1,8 @@ import os import pytest -from montreal_forced_aligner.command_line.align import run_align_corpus, DummyArgs +from montreal_forced_aligner.command_line.align import run_align_corpus +from montreal_forced_aligner.command_line.mfa import parser from montreal_forced_aligner.exceptions import PronunciationAcousticMismatchError @@ -21,21 +22,14 @@ def assert_export_exist(old_directory, new_directory): #@pytest.mark.skip(reason='Optimization') def test_align_basic(basic_corpus_dir, sick_dict_path, generated_dir, large_dataset_dictionary, temp_dir, basic_align_config, english_acoustic_model): - args = DummyArgs() - args.acoustic_model_path = 'english' - 
args.corpus_directory = basic_corpus_dir - args.dictionary_path = sick_dict_path - args.output_directory = os.path.join(generated_dir, 'basic_output') - args.quiet = True - args.clean = True - args.temp_directory = temp_dir - args.config_path = basic_align_config + command = ['align', basic_corpus_dir, sick_dict_path, 'english', os.path.join(generated_dir, 'basic_output'), + '-t', temp_dir, '-c', basic_align_config, '-q', '--clean', '-d'] + args, unknown = parser.parse_known_args(command) with pytest.raises(PronunciationAcousticMismatchError): - run_align_corpus(args) + run_align_corpus(args, unknown) - args.acoustic_model_path = 'english' - args.corpus_directory = basic_corpus_dir - args.dictionary_path = large_dataset_dictionary - args.output_directory = os.path.join(generated_dir, 'basic_output') - run_align_corpus(args) + command = ['align', basic_corpus_dir, large_dataset_dictionary, 'english', os.path.join(generated_dir, 'basic_output'), + '-t', temp_dir, '-c', basic_align_config, '-q', '--clean', '-d'] + args, unknown = parser.parse_known_args(command) + run_align_corpus(args, unknown) diff --git a/tests/test_commandline_classify_speakers.py b/tests/test_commandline_classify_speakers.py new file mode 100644 index 00000000..7b8ffc73 --- /dev/null +++ b/tests/test_commandline_classify_speakers.py @@ -0,0 +1,25 @@ +import os +import pytest + +from montreal_forced_aligner.command_line.classify_speakers import run_classify_speakers +from montreal_forced_aligner.command_line.mfa import parser + + +def test_classify(basic_corpus_dir, sick_dict_path, english_ivector_model, generated_dir, + transcription_acoustic_model, transcription_language_model, temp_dir): + output_path = os.path.join(generated_dir, 'classify_test') + command = ['classify_speakers', basic_corpus_dir, 'english_ivector', + output_path, + '-t', temp_dir, '-q', '--clean', '-d', '--disable_mp'] + args, unknown = parser.parse_known_args(command) + run_classify_speakers(args) + + +def 
test_cluster(basic_corpus_dir, sick_dict_path, english_ivector_model, generated_dir, + transcription_acoustic_model, transcription_language_model, temp_dir): + output_path = os.path.join(generated_dir, 'cluster_test') + command = ['classify_speakers', basic_corpus_dir, 'english_ivector', + output_path, + '-t', temp_dir, '-q', '--clean', '-d', '--cluster', '-s', '2', '--disable_mp'] + args, unknown = parser.parse_known_args(command) + run_classify_speakers(args) \ No newline at end of file diff --git a/tests/test_commandline_create_segments.py b/tests/test_commandline_create_segments.py new file mode 100644 index 00000000..c84cbf8d --- /dev/null +++ b/tests/test_commandline_create_segments.py @@ -0,0 +1,15 @@ +import os +import pytest + +from montreal_forced_aligner.command_line.create_segments import run_create_segments +from montreal_forced_aligner.command_line.mfa import parser + + +def test_create_segments(basic_corpus_dir, sick_dict_path, english_acoustic_model, generated_dir, + transcription_acoustic_model, transcription_language_model, temp_dir, basic_segment_config): + output_path = os.path.join(generated_dir, 'segment_output') + command = ['create_segments', basic_corpus_dir, + output_path, + '-t', temp_dir, '-q', '--clean', '-d', '--config', basic_segment_config] + args, unknown = parser.parse_known_args(command) + run_create_segments(args) \ No newline at end of file diff --git a/tests/test_commandline_g2p.py b/tests/test_commandline_g2p.py index e36824ac..40f1a0af 100644 --- a/tests/test_commandline_g2p.py +++ b/tests/test_commandline_g2p.py @@ -4,29 +4,17 @@ from montreal_forced_aligner.g2p.generator import G2P_DISABLED from montreal_forced_aligner.command_line.g2p import run_g2p from montreal_forced_aligner.command_line.train_g2p import run_train_g2p +from montreal_forced_aligner.command_line.mfa import parser from montreal_forced_aligner.dictionary import Dictionary -class G2PDummyArgs(object): - def __init__(self): - self.temp_directory = None - 
self.window_size = 2 - self.order = 5 - self.random_starts=1 - self.num_jobs = 2 - self.disable_mp = False - self.include_bracketed = False - - def test_train_g2p(sick_dict_path, sick_g2p_model_path, temp_dir): if G2P_DISABLED: pytest.skip('No Pynini found') - args = G2PDummyArgs() - args.validate = True - args.dictionary_path = sick_dict_path - args.output_model_path = sick_g2p_model_path - args.temp_directory = temp_dir + command = ['train_g2p', sick_dict_path, sick_g2p_model_path, + '-t', temp_dir, '-q', '--clean', '-d', '--validate'] + args, unknown = parser.parse_known_args(command) run_train_g2p(args) assert os.path.exists(sick_g2p_model_path) @@ -34,11 +22,9 @@ def test_train_g2p(sick_dict_path, sick_g2p_model_path, temp_dir): def test_generate_dict(basic_corpus_dir, sick_g2p_model_path, g2p_sick_output, temp_dir): if G2P_DISABLED: pytest.skip('No Pynini found') - args = G2PDummyArgs() - args.g2p_model_path = sick_g2p_model_path - args.input_path = basic_corpus_dir - args.output_path = g2p_sick_output - args.temp_directory = temp_dir + command = ['g2p', sick_g2p_model_path, basic_corpus_dir, g2p_sick_output, + '-t', temp_dir, '-q', '--clean', '-d'] + args, unknown = parser.parse_known_args(command) run_g2p(args) assert os.path.exists(g2p_sick_output) d = Dictionary(g2p_sick_output, temp_dir) @@ -48,11 +34,9 @@ def test_generate_dict(basic_corpus_dir, sick_g2p_model_path, g2p_sick_output, t def test_generate_orthography_dict(basic_corpus_dir, orth_sick_output, temp_dir): if G2P_DISABLED: pytest.skip('No Pynini found') - args = G2PDummyArgs() - args.g2p_model_path = None - args.input_path = basic_corpus_dir - args.output_path = orth_sick_output - args.temp_directory = temp_dir + command = ['g2p', basic_corpus_dir, orth_sick_output, + '-t', temp_dir, '-q', '--clean', '-d'] + args, unknown = parser.parse_known_args(command) run_g2p(args) assert os.path.exists(orth_sick_output) d = Dictionary(orth_sick_output, temp_dir) diff --git a/tests/test_commandline_lm.py 
b/tests/test_commandline_lm.py index 04e3d99d..f1a7ca16 100644 --- a/tests/test_commandline_lm.py +++ b/tests/test_commandline_lm.py @@ -3,33 +3,14 @@ import sys from montreal_forced_aligner.command_line.train_lm import run_train_lm - - -class DummyArgs(object): - def __init__(self): - self.source_path = '' - self.output_model_path = '' - self.dictionary_path = '' - self.speaker_characters = 0 - self.dictionary = '' - self.config_path = '' - self.model_path = '' - self.model_weight = 1.0 - self.num_jobs = 0 - self.verbose = False - self.clean = True - self.fast = True - self.debug = False - self.temp_directory = None +from montreal_forced_aligner.command_line.mfa import parser def test_train_lm(basic_corpus_dir, temp_dir, generated_dir, basic_train_lm_config): if sys.platform == 'win32': pytest.skip('LM training not supported on Windows.') - args = DummyArgs() - args.source_path = basic_corpus_dir - args.temp_directory = temp_dir - args.config_path = basic_train_lm_config - args.output_model_path = os.path.join(generated_dir, 'test_basic_lm.arpa') + command = ['train_lm', basic_corpus_dir, os.path.join(generated_dir, 'test_basic_lm.zip'), + '-t', temp_dir, '-c', basic_train_lm_config, '-q', '--clean'] + args, unknown = parser.parse_known_args(command) run_train_lm(args) assert os.path.exists(args.output_model_path) diff --git a/tests/test_commandline_thirdparty.py b/tests/test_commandline_thirdparty.py index 6727c536..aa33440d 100644 --- a/tests/test_commandline_thirdparty.py +++ b/tests/test_commandline_thirdparty.py @@ -6,12 +6,9 @@ from montreal_forced_aligner.thirdparty.download import download_binaries from montreal_forced_aligner.thirdparty.kaldi import validate_kaldi_binaries, collect_kaldi_binaries from montreal_forced_aligner.config import TEMP_DIR +from montreal_forced_aligner.command_line.mfa import parser -class ThirdpartyDummyArgs(object): - def __init__(self): - self.command = '' - def test_download(): bin_dir = os.path.join(TEMP_DIR, 'thirdparty', 
'bin') @@ -20,7 +17,9 @@ def test_download(): assert not validate_kaldi_binaries() - download_binaries() + command = ['thirdparty', 'download'] + args, unknown = parser.parse_known_args(command) + run_thirdparty(args) assert validate_kaldi_binaries() @@ -36,15 +35,14 @@ def test_collect(): assert not validate_kaldi_binaries() - collect_kaldi_binaries(backup_dir) + command = ['thirdparty', 'kaldi', backup_dir] + args, unknown = parser.parse_known_args(command) + run_thirdparty(args) assert validate_kaldi_binaries() def test_validate(): - args = ThirdpartyDummyArgs() - with pytest.raises(ArgumentError): - - run_thirdparty(args) - args.command = 'validate' + command = ['thirdparty', 'validate'] + args, unknown = parser.parse_known_args(command) run_thirdparty(args) diff --git a/tests/test_commandline_train.py b/tests/test_commandline_train.py index 65f47df6..ecbadd34 100644 --- a/tests/test_commandline_train.py +++ b/tests/test_commandline_train.py @@ -1,8 +1,7 @@ import os -from montreal_forced_aligner.command_line.align import DummyArgs - from montreal_forced_aligner.command_line.train_and_align import run_train_corpus +from montreal_forced_aligner.command_line.mfa import parser # @pytest.mark.skip(reason='Optimization') @@ -10,16 +9,8 @@ def test_train_and_align_basic(basic_corpus_dir, sick_dict_path, generated_dir, mono_train_config_path, textgrid_output_model_path): if os.path.exists(textgrid_output_model_path): os.remove(textgrid_output_model_path) - args = DummyArgs() - args.corpus_directory = basic_corpus_dir - args.dictionary_path = sick_dict_path - args.output_directory = os.path.join(generated_dir, 'basic_output') - args.quiet = True - args.clean = True - args.temp_directory = temp_dir - args.config_path = mono_train_config_path - args.output_model_path = textgrid_output_model_path - - args.corpus_directory = basic_corpus_dir - run_train_corpus(args) - assert os.path.exists(args.output_model_path) + command = ['train', basic_corpus_dir, sick_dict_path, 
os.path.join(generated_dir, 'basic_output'), + '-t', temp_dir, '-c', mono_train_config_path, '-q', '--clean', '-d', '-o', textgrid_output_model_path] + args, unknown = parser.parse_known_args(command) + run_train_corpus(args, unknown) + assert os.path.exists(textgrid_output_model_path) diff --git a/tests/test_commandline_train_dict.py b/tests/test_commandline_train_dict.py new file mode 100644 index 00000000..f9155f28 --- /dev/null +++ b/tests/test_commandline_train_dict.py @@ -0,0 +1,15 @@ +import os +import pytest + +from montreal_forced_aligner.command_line.train_dictionary import run_train_dictionary +from montreal_forced_aligner.command_line.mfa import parser + + +def test_train_dict(basic_corpus_dir, sick_dict_path, english_acoustic_model, generated_dir, + transcription_acoustic_model, transcription_language_model, temp_dir, basic_align_config): + output_path = os.path.join(generated_dir, 'trained_dict.txt') + command = ['train_dictionary', basic_corpus_dir, sick_dict_path, transcription_acoustic_model, + output_path, + '-t', temp_dir, '-q', '--clean', '-d', '--config', basic_align_config] + args, unknown = parser.parse_known_args(command) + run_train_dictionary(args) \ No newline at end of file diff --git a/tests/test_commandline_train_ivector.py b/tests/test_commandline_train_ivector.py index 21b24515..50d79a41 100644 --- a/tests/test_commandline_train_ivector.py +++ b/tests/test_commandline_train_ivector.py @@ -1,34 +1,14 @@ import os from montreal_forced_aligner.command_line.train_ivector_extractor import run_train_ivector_extractor - - -class DummyArgs(object): - def __init__(self): - self.corpus_directory = '' - self.dictionary_path = '' - self.speaker_characters = 0 - self.num_jobs = 0 - self.verbose = False - self.clean = True - self.fast = True - self.debug = False - self.temp_directory = None - self.config_path = '' - self.output_model_path = '' +from montreal_forced_aligner.command_line.mfa import parser # @pytest.mark.skip(reason='Optimization') 
-def test_basic_ivector(basic_corpus_dir, sick_dict_path, generated_dir, large_dataset_dictionary, temp_dir, - basic_train_ivector_config, english_acoustic_model, ivector_output_model_path): - args = DummyArgs() - args.corpus_directory = basic_corpus_dir - args.quiet = True - args.clean = True - args.temp_directory = temp_dir - args.dictionary_path = large_dataset_dictionary - args.output_directory = os.path.join(generated_dir, 'basic_output') - args.output_model_path = ivector_output_model_path - args.config_path = basic_train_ivector_config +def test_basic_ivector(basic_corpus_dir, generated_dir, large_dataset_dictionary, temp_dir, + train_ivector_config, english_acoustic_model, ivector_output_model_path): + command = ['train_ivector', basic_corpus_dir, large_dataset_dictionary, 'english', ivector_output_model_path, + '-t', temp_dir, '-c', train_ivector_config, '-q', '--clean', '-d'] + args, unknown = parser.parse_known_args(command) run_train_ivector_extractor(args) assert os.path.exists(args.output_model_path) diff --git a/tests/test_commandline_transcribe.py b/tests/test_commandline_transcribe.py index 3ee5d050..77998c42 100644 --- a/tests/test_commandline_transcribe.py +++ b/tests/test_commandline_transcribe.py @@ -2,34 +2,14 @@ import pytest from montreal_forced_aligner.command_line.transcribe import run_transcribe_corpus - - -class DummyArgs(object): - def __init__(self): - self.corpus_directory = '' - self.dictionary_path = '' - self.acoustic_model_path = '' - self.output_directory = '' - self.config_path = '' - self.speaker_characters = 0 - self.num_jobs = 0 - self.verbose = False - self.clean = True - self.fast = True - self.debug = False - self.evaluate = False - self.temp_directory = None +from montreal_forced_aligner.command_line.mfa import parser def test_transcribe(basic_corpus_dir, sick_dict_path, english_acoustic_model, generated_dir, - transcription_acoustic_model, transcription_language_model, temp_dir): + transcription_acoustic_model, 
transcription_language_model, temp_dir, transcribe_config): output_path = os.path.join(generated_dir, 'transcribe_test') - args = DummyArgs() - args.acoustic_model_path = transcription_acoustic_model - args.corpus_directory = basic_corpus_dir - args.dictionary_path = sick_dict_path - args.language_model_path = transcription_language_model - args.output_directory = output_path - args.temp_directory = temp_dir - args.evaluate = True - run_transcribe_corpus(args) + command = ['transcribe', basic_corpus_dir, sick_dict_path, transcription_acoustic_model, + transcription_language_model, output_path, + '-t', temp_dir, '-q', '--clean', '-d', '--config', transcribe_config] + args, unknown = parser.parse_known_args(command) + run_transcribe_corpus(args) \ No newline at end of file diff --git a/tests/test_commandline_validate.py b/tests/test_commandline_validate.py index 79b56d80..72ffd188 100644 --- a/tests/test_commandline_validate.py +++ b/tests/test_commandline_validate.py @@ -1,23 +1,11 @@ from montreal_forced_aligner.command_line.validate import run_validate_corpus +from montreal_forced_aligner.command_line.mfa import parser -class ValidatorDummyArgs(object): - def __init__(self): - self.temp_directory = None - self.test_transcriptions = False - self.num_jobs = 0 - self.speaker_characters = 0 - self.ignore_acoustics = False - self.disable_mp = False +def test_validate_corpus(large_prosodylab_format_directory, large_dataset_dictionary, temp_dir): - -def test_validate_corpus(large_prosodylab_format_directory, large_dataset_dictionary,temp_dir): - args = ValidatorDummyArgs() - args.num_jobs = 2 - args.corpus_directory = large_prosodylab_format_directory - args.dictionary_path = large_dataset_dictionary - args.temp_directory = temp_dir - args.test_transcriptions = True - args.disable_mp = True + command = ['validate', large_prosodylab_format_directory, large_dataset_dictionary, 'english', + '-t', temp_dir, '-q', '--clean', '-d', '--disable_mp', '--test_transcriptions', 
'-j', '0'] + args, unknown = parser.parse_known_args(command) run_validate_corpus(args) diff --git a/tests/test_config.py b/tests/test_config.py index a7789f49..08079212 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -3,6 +3,7 @@ from montreal_forced_aligner.config import FeatureConfig, train_yaml_to_config, align_yaml_to_config, ConfigError from montreal_forced_aligner.trainers import MonophoneTrainer, TriphoneTrainer, LdaTrainer, SatTrainer + def test_monophone_config(): config = MonophoneTrainer(FeatureConfig()) assert config.realignment_iterations == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, @@ -27,15 +28,27 @@ def test_load_align(config_directory, mono_align_config_path): align = align_yaml_to_config(path) - def test_load_mono_train(config_directory, mono_train_config_path): train, align = train_yaml_to_config(mono_train_config_path) for t in train.training_configs: assert not t.use_mp assert not t.feature_config.use_mp + assert t.feature_config.use_energy + assert not align.use_mp + assert not align.feature_config.use_mp + assert align.feature_config.use_energy + + +def test_load_ivector_train(config_directory, train_ivector_config): + train, align = train_yaml_to_config(train_ivector_config) + for t in train.training_configs: + assert not t.use_mp + assert not t.feature_config.use_mp + assert t.feature_config.use_energy assert not align.use_mp assert not align.feature_config.use_mp + def test_load(config_directory): path = os.path.join(config_directory, 'basic_train_config.yaml') train, align = train_yaml_to_config(path) diff --git a/tests/test_corpus.py b/tests/test_corpus.py index ba926a49..ee12bc6d 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -12,7 +12,7 @@ def test_basic(basic_dict_path, basic_corpus_dir, generated_dir, default_feature dictionary = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic')) dictionary.write() output_directory = os.path.join(generated_dir, 'basic') - c = 
AlignableCorpus(basic_corpus_dir, output_directory) + c = AlignableCorpus(basic_corpus_dir, output_directory, use_mp=True) c.initialize_corpus(dictionary) default_feature_config.generate_features(c) assert c.get_feat_dim(default_feature_config) == 39 @@ -22,7 +22,7 @@ def test_basic_txt(basic_corpus_txt_dir, basic_dict_path, generated_dir, default dictionary = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic')) dictionary.write() output_directory = os.path.join(generated_dir, 'basic') - c = AlignableCorpus(basic_corpus_txt_dir, output_directory) + c = AlignableCorpus(basic_corpus_txt_dir, output_directory, use_mp=False) assert len(c.no_transcription_files) == 0 c.initialize_corpus(dictionary) default_feature_config.generate_features(c) @@ -31,7 +31,7 @@ def test_basic_txt(basic_corpus_txt_dir, basic_dict_path, generated_dir, default def test_extra(sick_dict, extra_corpus_dir, generated_dir): output_directory = os.path.join(generated_dir, 'extra') - corpus = AlignableCorpus(extra_corpus_dir, output_directory, num_jobs=2) + corpus = AlignableCorpus(extra_corpus_dir, output_directory, num_jobs=2, use_mp=False) corpus.initialize_corpus(sick_dict) @@ -39,7 +39,7 @@ def test_stereo(basic_dict_path, stereo_corpus_dir, temp_dir, default_feature_co temp = os.path.join(temp_dir, 'stereo') dictionary = Dictionary(basic_dict_path, os.path.join(temp, 'basic')) dictionary.write() - d = AlignableCorpus(stereo_corpus_dir, temp) + d = AlignableCorpus(stereo_corpus_dir, temp, use_mp=False) d.initialize_corpus(dictionary) default_feature_config.generate_features(d) assert d.get_feat_dim(default_feature_config) == 39 @@ -48,7 +48,7 @@ def test_stereo(basic_dict_path, stereo_corpus_dir, temp_dir, default_feature_co def test_24bit_wav(transcribe_corpus_24bit_dir, temp_dir, default_feature_config): temp = os.path.join(temp_dir, '24bit') - c = TranscribeCorpus(transcribe_corpus_24bit_dir, temp) + c = TranscribeCorpus(transcribe_corpus_24bit_dir, temp, use_mp=False) assert 
len(c.unsupported_bit_depths) == 1 with pytest.raises(CorpusError): c.initialize_corpus() @@ -58,7 +58,7 @@ def test_short_segments(basic_dict_path, shortsegments_corpus_dir, temp_dir, def temp = os.path.join(temp_dir, 'short_segments') dictionary = Dictionary(basic_dict_path, temp) dictionary.write() - corpus = AlignableCorpus(shortsegments_corpus_dir, temp) + corpus = AlignableCorpus(shortsegments_corpus_dir, temp, use_mp=False) corpus.initialize_corpus(dictionary) default_feature_config.generate_features(corpus) assert len(corpus.feat_mapping.keys()) == 2 @@ -75,7 +75,7 @@ def test_speaker_groupings(large_prosodylab_format_directory, temp_dir, large_da shutil.rmtree(output_directory, ignore_errors=True) d = Dictionary(large_dataset_dictionary, output_directory) d.write() - c = AlignableCorpus(large_prosodylab_format_directory, output_directory) + c = AlignableCorpus(large_prosodylab_format_directory, output_directory, use_mp=False) c.initialize_corpus(d) default_feature_config.generate_features(c) @@ -94,7 +94,7 @@ def test_speaker_groupings(large_prosodylab_format_directory, temp_dir, large_da shutil.rmtree(output_directory, ignore_errors=True) d.write() - c = AlignableCorpus(large_prosodylab_format_directory, output_directory, num_jobs=2) + c = AlignableCorpus(large_prosodylab_format_directory, output_directory, num_jobs=2, use_mp=False) c.initialize_corpus(d) default_feature_config.generate_features(c) @@ -116,7 +116,7 @@ def test_subset(large_prosodylab_format_directory, temp_dir, large_dataset_dicti shutil.rmtree(output_directory, ignore_errors=True) d = Dictionary(large_dataset_dictionary, output_directory) d.write() - c = AlignableCorpus(large_prosodylab_format_directory, output_directory) + c = AlignableCorpus(large_prosodylab_format_directory, output_directory, use_mp=False) c.initialize_corpus(d) sd = c.split_directory() @@ -136,7 +136,7 @@ def test_weird_words(weird_words_dir, temp_dir, sick_dict_path): assert d.words["i'm"][1]['pronunciation'] == 
('ay', 'm') assert d.words["'m"][0]['pronunciation'] == ('m',) d.write() - c = AlignableCorpus(weird_words_dir, output_directory) + c = AlignableCorpus(weird_words_dir, output_directory, use_mp=False) c.initialize_corpus(d) print(c.utterance_oovs['weird_words']) assert c.utterance_oovs['weird_words'] == ['ajfish', 'asds-asda', 'sdasd'] \ No newline at end of file