From e88a65e9997d0be7adeedc558111a2c1126b9269 Mon Sep 17 00:00:00 2001 From: Michael McAuliffe Date: Tue, 14 Feb 2023 22:46:13 -0800 Subject: [PATCH] 2.2.0: Add support for tokenizers (#566) * Add support for tokenizers --- docs/source/reference/tokenization/helper.rst | 14 + docs/source/reference/tokenization/index.rst | 20 + .../reference/tokenization/tokenizer.rst | 13 + .../reference/tokenization/training.rst | 12 + docs/source/reference/top_level_index.rst | 1 + .../user_guide/corpus_creation/index.rst | 2 + .../user_guide/corpus_creation/tokenize.rst | 20 + .../corpus_creation/train_tokenizer.rst | 24 + montreal_forced_aligner/abc.py | 46 +- .../acoustic_modeling/base.py | 67 +-- .../acoustic_modeling/lda.py | 25 +- .../acoustic_modeling/monophone.py | 13 +- .../pronunciation_probabilities.py | 55 +-- .../acoustic_modeling/sat.py | 27 +- .../acoustic_modeling/trainer.py | 38 +- .../acoustic_modeling/triphone.py | 45 +- montreal_forced_aligner/alignment/adapting.py | 57 +-- montreal_forced_aligner/alignment/base.py | 18 +- montreal_forced_aligner/alignment/mixins.py | 41 +- .../alignment/multiprocessing.py | 124 +++-- .../alignment/pretrained.py | 30 +- montreal_forced_aligner/command_line/align.py | 2 +- montreal_forced_aligner/command_line/g2p.py | 2 +- montreal_forced_aligner/command_line/mfa.py | 4 + montreal_forced_aligner/command_line/model.py | 4 +- .../command_line/tokenize.py | 76 +++ .../command_line/train_tokenizer.py | 81 ++++ montreal_forced_aligner/command_line/utils.py | 9 +- montreal_forced_aligner/config.py | 4 +- .../corpus/acoustic_corpus.py | 68 +-- montreal_forced_aligner/corpus/base.py | 33 +- montreal_forced_aligner/corpus/features.py | 65 ++- .../corpus/ivector_corpus.py | 24 +- .../corpus/multiprocessing.py | 29 +- montreal_forced_aligner/corpus/text_corpus.py | 14 +- montreal_forced_aligner/data.py | 6 +- montreal_forced_aligner/db.py | 110 ++--- .../diarization/multiprocessing.py | 5 +- .../diarization/speaker_diarizer.py | 40 +- montreal_forced_aligner/dictionary/mixins.py | 41 +- .../dictionary/multispeaker.py | 46 +- montreal_forced_aligner/exceptions.py | 29 +- montreal_forced_aligner/g2p/generator.py | 17 +- montreal_forced_aligner/g2p/mixins.py | 2 +- .../g2p/phonetisaurus_trainer.py | 163 ++++--- montreal_forced_aligner/g2p/trainer.py | 116 ++--- montreal_forced_aligner/helper.py | 22 +- .../ivector/multiprocessing.py | 21 +- montreal_forced_aligner/ivector/trainer.py | 83 ++-- .../language_modeling/multiprocessing.py | 69 ++- .../language_modeling/trainer.py | 79 ++-- montreal_forced_aligner/models.py | 341 ++++++++++---- montreal_forced_aligner/online/alignment.py | 31 +- montreal_forced_aligner/textgrid.py | 6 +- .../tokenization/__init__.py | 6 + .../tokenization/tokenizer.py | 439 ++++++++++++++++++ .../tokenization/trainer.py | 286 ++++++++++++ .../transcription/multiprocessing.py | 303 ++++++------ .../transcription/transcriber.py | 101 ++-- montreal_forced_aligner/utils.py | 14 +- .../vad/multiprocessing.py | 3 +- montreal_forced_aligner/vad/segmenter.py | 9 +- .../validation/corpus_validator.py | 7 +- .../validation/dictionary_validator.py | 14 +- tests/conftest.py | 398 ++++++++-------- tests/data/tokenizer/test_tokenizer_model.zip | Bin 0 -> 72507 bytes tests/test_acoustic_modeling.py | 33 +- tests/test_commandline_adapt.py | 12 +- tests/test_commandline_align.py | 42 +- tests/test_commandline_configure.py | 1 + tests/test_commandline_create_segments.py | 6 +- tests/test_commandline_diarize_speakers.py | 12 +- tests/test_commandline_g2p.py 
| 14 +- tests/test_commandline_lm.py | 15 +- tests/test_commandline_model.py | 2 +- tests/test_commandline_tokenize.py | 92 ++++ tests/test_commandline_train.py | 6 +- tests/test_commandline_train_dict.py | 6 +- tests/test_commandline_train_ivector.py | 1 + tests/test_commandline_transcribe.py | 12 +- tests/test_commandline_validate.py | 5 + tests/test_corpus.py | 52 +-- tests/test_dict.py | 21 +- tests/test_gui.py | 8 +- 84 files changed, 2759 insertions(+), 1495 deletions(-) create mode 100644 docs/source/reference/tokenization/helper.rst create mode 100644 docs/source/reference/tokenization/index.rst create mode 100644 docs/source/reference/tokenization/tokenizer.rst create mode 100644 docs/source/reference/tokenization/training.rst create mode 100644 docs/source/user_guide/corpus_creation/tokenize.rst create mode 100644 docs/source/user_guide/corpus_creation/train_tokenizer.rst create mode 100644 montreal_forced_aligner/command_line/tokenize.py create mode 100644 montreal_forced_aligner/command_line/train_tokenizer.py create mode 100644 montreal_forced_aligner/tokenization/__init__.py create mode 100644 montreal_forced_aligner/tokenization/tokenizer.py create mode 100644 montreal_forced_aligner/tokenization/trainer.py create mode 100644 tests/data/tokenizer/test_tokenizer_model.zip create mode 100644 tests/test_commandline_tokenize.py diff --git a/docs/source/reference/tokenization/helper.rst b/docs/source/reference/tokenization/helper.rst new file mode 100644 index 00000000..e560aad6 --- /dev/null +++ b/docs/source/reference/tokenization/helper.rst @@ -0,0 +1,14 @@ +Helper functionality +==================== + +Helper +------ + +.. currentmodule:: montreal_forced_aligner.tokenization.tokenizer + +.. autosummary:: + :toctree: generated/ + + TokenizerRewriter + TokenizerArguments + TokenizerFunction diff --git a/docs/source/reference/tokenization/index.rst b/docs/source/reference/tokenization/index.rst new file mode 100644 index 00000000..51fc207b --- /dev/null +++ b/docs/source/reference/tokenization/index.rst @@ -0,0 +1,20 @@ + +.. _tokenization_api: + +Tokenizers +========== + +Tokenizers allow for adding spaces as word boundaries for orthographic systems that don't normally use them (e.g., Japanese, Chinese, Thai). + +.. currentmodule:: montreal_forced_aligner.models + +.. autosummary:: + :toctree: generated/ + + TokenizerModel + +.. toctree:: + + training + tokenizer + helper diff --git a/docs/source/reference/tokenization/tokenizer.rst b/docs/source/reference/tokenization/tokenizer.rst new file mode 100644 index 00000000..6088a838 --- /dev/null +++ b/docs/source/reference/tokenization/tokenizer.rst @@ -0,0 +1,13 @@ + +.. _tokenizer_api: + +Corpus tokenizer +================ + +.. currentmodule:: montreal_forced_aligner.tokenization.tokenizer + +.. autosummary:: + :toctree: generated/ + + CorpusTokenizer + TokenizerValidator diff --git a/docs/source/reference/tokenization/training.rst b/docs/source/reference/tokenization/training.rst new file mode 100644 index 00000000..e19094b4 --- /dev/null +++ b/docs/source/reference/tokenization/training.rst @@ -0,0 +1,12 @@ + +.. _tokenizer_model_training_api: + +Training tokenizer models +========================= + +.. currentmodule:: montreal_forced_aligner.tokenization.trainer + +.. 
autosummary:: + :toctree: generated/ + + TokenizerTrainer -- Trainer for tokenizer models on text corpora diff --git a/docs/source/reference/top_level_index.rst b/docs/source/reference/top_level_index.rst index 7fc04b83..8fa41665 100644 --- a/docs/source/reference/top_level_index.rst +++ b/docs/source/reference/top_level_index.rst @@ -9,3 +9,4 @@ Workflows transcription/index segmentation/index diarization/index + tokenization/index diff --git a/docs/source/user_guide/corpus_creation/index.rst b/docs/source/user_guide/corpus_creation/index.rst index fafa019a..22cb4a20 100644 --- a/docs/source/user_guide/corpus_creation/index.rst +++ b/docs/source/user_guide/corpus_creation/index.rst @@ -24,4 +24,6 @@ MFA now contains several command line utilities for helping to create corpora fr transcribing training_lm training_dictionary + tokenize + train_tokenizer anchor diff --git a/docs/source/user_guide/corpus_creation/tokenize.rst b/docs/source/user_guide/corpus_creation/tokenize.rst new file mode 100644 index 00000000..3010f65e --- /dev/null +++ b/docs/source/user_guide/corpus_creation/tokenize.rst @@ -0,0 +1,20 @@ + +.. _tokenize_cli: + +Tokenize utterances ``(mfa tokenize)`` +====================================== + +Use a model trained from :ref:`train_tokenizer_cli` to tokenize a corpus (i.e. insert spaces as word boundaries for orthographic systems that do not require them). + +Command reference +----------------- + +.. click:: montreal_forced_aligner.command_line.tokenize:tokenize_cli + :prog: mfa tokenize + :nested: full + + +API reference +------------- + +- :ref:`tokenization_api` diff --git a/docs/source/user_guide/corpus_creation/train_tokenizer.rst b/docs/source/user_guide/corpus_creation/train_tokenizer.rst new file mode 100644 index 00000000..6a88722c --- /dev/null +++ b/docs/source/user_guide/corpus_creation/train_tokenizer.rst @@ -0,0 +1,24 @@ + +.. _train_tokenizer_cli: + +Train a word tokenizer ``(mfa train_tokenizer)`` +================================================ + +Training a tokenizer uses a simplified sequence-to-sequence model like G2P, but with the following differences: + +* Both the input and output symbols are graphemes +* Symbols can only output themselves +* Only space characters can be inserted + +Command reference +----------------- + +.. click:: montreal_forced_aligner.command_line.train_tokenizer:train_tokenizer_cli + :prog: mfa train_tokenizer + :nested: full + + +API reference +------------- + +- :ref:`tokenization_api` diff --git a/montreal_forced_aligner/abc.py b/montreal_forced_aligner/abc.py index 9e8c9c22..094a3a26 100644 --- a/montreal_forced_aligner/abc.py +++ b/montreal_forced_aligner/abc.py @@ -144,7 +144,7 @@ def data_source_identifier(self) -> str: @property @abc.abstractmethod - def output_directory(self) -> str: + def output_directory(self) -> Path: """Root temporary directory""" ...
@@ -153,52 +153,52 @@ def clean_working_directory(self) -> None: shutil.rmtree(self.output_directory, ignore_errors=True) @property - def corpus_output_directory(self) -> str: + def corpus_output_directory(self) -> Path: """Temporary directory containing all corpus information""" if self._corpus_output_directory: return self._corpus_output_directory - return os.path.join(self.output_directory, f"{self.data_source_identifier}") + return self.output_directory.joinpath(f"{self.data_source_identifier}") @corpus_output_directory.setter - def corpus_output_directory(self, directory: str) -> None: + def corpus_output_directory(self, directory: Path) -> None: self._corpus_output_directory = directory @property - def dictionary_output_directory(self) -> str: + def dictionary_output_directory(self) -> Path: """Temporary directory containing all dictionary information""" if self._dictionary_output_directory: return self._dictionary_output_directory - return os.path.join(self.output_directory, "dictionary") + return self.output_directory.joinpath("dictionary") @property - def model_output_directory(self) -> str: - """Temporary directory containing all dictionary information""" + def model_output_directory(self) -> Path: + """Temporary directory containing all model information""" - return os.path.join(self.output_directory, "models") + return self.output_directory.joinpath("models") @dictionary_output_directory.setter - def dictionary_output_directory(self, directory: str) -> None: + def dictionary_output_directory(self, directory: Path) -> None: self._dictionary_output_directory = directory @property - def language_model_output_directory(self) -> str: - """Temporary directory containing all dictionary information""" + def language_model_output_directory(self) -> Path: + """Temporary directory containing all language model information""" if self._language_model_output_directory: return self._language_model_output_directory - return os.path.join(self.model_output_directory, "language_model") + return self.model_output_directory.joinpath("language_model") @language_model_output_directory.setter - def language_model_output_directory(self, directory: str) -> None: + def language_model_output_directory(self, directory: Path) -> None: self._language_model_output_directory = directory @property - def acoustic_model_output_directory(self) -> str: - """Temporary directory containing all dictionary information""" + def acoustic_model_output_directory(self) -> Path: + """Temporary directory containing all acoustic model information""" if self._acoustic_model_output_directory: return self._acoustic_model_output_directory - return os.path.join(self.model_output_directory, "acoustic_model") + return self.model_output_directory.joinpath("acoustic_model") @acoustic_model_output_directory.setter - def acoustic_model_output_directory(self, directory: str) -> None: + def acoustic_model_output_directory(self, directory: Path) -> None: self._acoustic_model_output_directory = directory @@ -459,18 +459,18 @@ def configuration(self) -> MetaDict: @property @abc.abstractmethod - def working_directory(self) -> str: + def working_directory(self) -> Path: """Current working directory""" ... @property - def working_log_directory(self) -> str: + def working_log_directory(self) -> Path: """Current working log directory""" - return os.path.join(self.working_directory, "log") + return self.working_directory.joinpath("log") @property @abc.abstractmethod - def data_directory(self) -> str: + def data_directory(self) -> Path: """Data directory""" ...
@@ -577,7 +577,7 @@ def parse_args( @classmethod def parse_parameters( cls, - config_path: Optional[str] = None, + config_path: Optional[Path] = None, args: Optional[Dict[str, Any]] = None, unknown_args: Optional[typing.Iterable[str]] = None, ) -> MetaDict: @@ -586,7 +586,7 @@ def parse_parameters( Parameters ---------- - config_path: str, optional + config_path: :class:`~pathlib.Path`, optional Path to yaml configuration file args: dict[str, Any] Parsed arguments @@ -777,13 +777,13 @@ def meta(self) -> MetaDict: ... @abc.abstractmethod - def export_model(self, output_model_path: str) -> None: + def export_model(self, output_model_path: Path) -> None: """ Abstract method to export an MFA model Parameters ---------- - output_model_path: str + output_model_path: :class:`~pathlib.Path` Path to export model """ ... diff --git a/montreal_forced_aligner/acoustic_modeling/base.py b/montreal_forced_aligner/acoustic_modeling/base.py index f4630fc5..a15041bb 100644 --- a/montreal_forced_aligner/acoustic_modeling/base.py +++ b/montreal_forced_aligner/acoustic_modeling/base.py @@ -8,6 +8,7 @@ import subprocess import time from abc import abstractmethod +from pathlib import Path from queue import Empty from typing import TYPE_CHECKING, List @@ -261,40 +262,40 @@ def acoustic_model_training_params(self) -> MetaDict: } @property - def working_directory(self) -> str: + def working_directory(self) -> Path: """Training directory""" - return os.path.join(self.worker.output_directory, self.identifier) + return self.worker.output_directory.joinpath(self.identifier) @property - def working_log_directory(self) -> str: + def working_log_directory(self) -> Path: """Training log directory""" - return os.path.join(self.working_directory, "log") + return self.working_directory.joinpath("log") @property - def model_path(self) -> str: + def model_path(self) -> Path: """Current acoustic model path""" if self.workflow.done: return self.next_model_path - return os.path.join(self.working_directory, f"{self.iteration}.mdl") + return self.working_directory.joinpath(f"{self.iteration}.mdl") @property - def alignment_model_path(self) -> str: + def alignment_model_path(self) -> Path: """Alignment model path""" return self.model_path @property - def next_model_path(self) -> str: + def next_model_path(self) -> Path: """Next iteration's acoustic model path""" if self.workflow.done: - return os.path.join(self.working_directory, "final.mdl") - return os.path.join(self.working_directory, f"{self.iteration + 1}.mdl") + return self.working_directory.joinpath("final.mdl") + return self.working_directory.joinpath(f"{self.iteration + 1}.mdl") @property - def next_occs_path(self) -> str: + def next_occs_path(self) -> Path: """Next iteration's occs file path""" if self.workflow.done: - return os.path.join(self.working_directory, "final.occs") - return os.path.join(self.working_directory, f"{self.iteration + 1}.occs") + return self.working_directory.joinpath("final.occs") + return self.working_directory.joinpath(f"{self.iteration + 1}.occs") @abstractmethod def compute_calculated_properties(self) -> None: @@ -365,7 +366,7 @@ def acc_stats(self) -> None: for num_utterances, errors in function.run(): pbar.update(num_utterances + errors) - log_path = os.path.join(self.working_log_directory, f"update.{self.iteration}.log") + log_path = self.working_log_directory.joinpath(f"update.{self.iteration}.log") with mfa_open(log_path, "w") as log_file: acc_files = [] for a in arguments: @@ -449,9 +450,9 @@ def align_iteration(self) -> None: @property def 
initialized(self) -> bool: return ( - os.path.exists(os.path.join(self.working_directory, "1.mdl")) - or os.path.exists(os.path.join(self.working_directory, "final.mdl")) - or os.path.exists(os.path.join(self.working_directory, "done")) + os.path.exists(self.working_directory.joinpath("1.mdl")) + or os.path.exists(self.working_directory.joinpath("final.mdl")) + or os.path.exists(self.working_directory.joinpath("done")) ) def train_iteration(self) -> None: @@ -507,9 +508,9 @@ def train(self) -> None: logger.debug(f"Training took {time.time() - begin:.3f} seconds") @property - def exported_model_path(self) -> str: + def exported_model_path(self) -> Path: """Model path to export to once training is complete""" - return os.path.join(self.working_log_directory, "acoustic_model.zip") + return self.working_log_directory.joinpath("acoustic_model.zip") def finalize_training(self) -> None: """ @@ -518,36 +519,36 @@ def finalize_training(self) -> None: """ os.rename( - os.path.join(self.working_directory, f"{self.num_iterations+1}.mdl"), - os.path.join(self.working_directory, "final.mdl"), + self.working_directory.joinpath(f"{self.num_iterations+1}.mdl"), + self.working_directory.joinpath("final.mdl"), ) - final_occs_path = os.path.join(self.working_directory, "final.occs") + final_occs_path = self.working_directory.joinpath("final.occs") if not os.path.exists(final_occs_path): os.rename( - os.path.join(self.working_directory, f"{self.num_iterations+1}.occs"), + self.working_directory.joinpath(f"{self.num_iterations+1}.occs"), final_occs_path, ) - ali_model_path = os.path.join(self.working_directory, f"{self.num_iterations+1}.alimdl") + ali_model_path = self.working_directory.joinpath(f"{self.num_iterations+1}.alimdl") if os.path.exists(ali_model_path): os.rename( ali_model_path, - os.path.join(self.working_directory, "final.alimdl"), + self.working_directory.joinpath("final.alimdl"), ) self.export_model(self.exported_model_path) if not GLOBAL_CONFIG.debug: for i in range(1, self.num_iterations + 1): - model_path = os.path.join(self.working_directory, f"{i}.mdl") + model_path = self.working_directory.joinpath(f"{i}.mdl") try: os.remove(model_path) except FileNotFoundError: pass try: - os.remove(os.path.join(self.working_directory, f"{i}.occs")) + os.remove(self.working_directory.joinpath(f"{i}.occs")) except FileNotFoundError: pass for file in os.listdir(self.working_directory): if any(file.startswith(x) for x in ["fsts.", "trans.", "ali."]): - os.remove(os.path.join(self.working_directory, file)) + os.remove(self.working_directory.joinpath(file)) wf = self.worker.current_workflow with self.session() as session: session.query(CorpusWorkflow).filter(CorpusWorkflow.id == wf.id).update({"done": True}) @@ -626,7 +627,7 @@ def meta(self) -> MetaDict: } return data - def export_model(self, output_model_path: str) -> None: + def export_model(self, output_model_path: Path) -> None: """ Export an acoustic model to the specified path Parameters ---------- - output_model_path : str + output_model_path : :class:`~pathlib.Path` Path to save acoustic model """ - directory, filename = os.path.split(output_model_path) - basename, _ = os.path.splitext(filename) - acoustic_model = AcousticModel.empty(basename, root_directory=self.working_log_directory) + directory = output_model_path.parent + + acoustic_model = AcousticModel.empty( + output_model_path.stem, root_directory=self.working_log_directory + ) acoustic_model.add_meta_file(self) acoustic_model.add_model(self.working_directory)
acoustic_model.add_pronunciation_models( diff --git a/montreal_forced_aligner/acoustic_modeling/lda.py b/montreal_forced_aligner/acoustic_modeling/lda.py index fe5fab43..21f0be15 100644 --- a/montreal_forced_aligner/acoustic_modeling/lda.py +++ b/montreal_forced_aligner/acoustic_modeling/lda.py @@ -8,6 +8,7 @@ import shutil import subprocess import typing +from pathlib import Path from queue import Empty from typing import TYPE_CHECKING, Dict, List @@ -45,10 +46,10 @@ class LdaAccStatsArguments(MfaArguments): dictionaries: List[str] feature_strings: Dict[str, str] - ali_paths: Dict[str, str] - model_path: str + ali_paths: Dict[str, Path] + model_path: Path lda_options: MetaDict - acc_paths: Dict[str, str] + acc_paths: Dict[str, Path] class CalcLdaMlltArguments(MfaArguments): @@ -56,10 +57,10 @@ class CalcLdaMlltArguments(MfaArguments): dictionaries: List[str] feature_strings: Dict[str, str] - ali_paths: Dict[str, str] - model_path: str + ali_paths: Dict[str, Path] + model_path: Path lda_options: MetaDict - macc_paths: Dict[str, str] + macc_paths: Dict[str, Path] class LdaAccStatsFunction(KaldiFunction): @@ -318,7 +319,7 @@ def lda_acc_stats_arguments(self) -> List[LdaAccStatsArguments]: LdaAccStatsArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"lda_acc_stats.{j.id}.log"), + self.working_log_directory.joinpath(f"lda_acc_stats.{j.id}.log"), j.dictionary_ids, feat_strings, j.construct_path_dictionary( @@ -407,7 +408,7 @@ def lda_acc_stats(self) -> None: """ worker_lda_path = os.path.join(self.worker.working_directory, "lda.mat") - lda_path = os.path.join(self.working_directory, "lda.mat") + lda_path = self.working_directory.joinpath("lda.mat") if os.path.exists(worker_lda_path): os.remove(worker_lda_path) arguments = self.lda_acc_stats_arguments() @@ -450,7 +451,7 @@ def lda_acc_stats(self) -> None: for done, errors in function.run(): pbar.update(done + errors) - log_path = os.path.join(self.working_log_directory, "lda_est.log") + log_path = self.working_log_directory.joinpath("lda_est.log") acc_list = [] for x in arguments: acc_list.extend(x.acc_paths.values()) @@ -549,9 +550,9 @@ def calc_lda_mllt(self) -> None: log_path = os.path.join( self.working_log_directory, f"transform_means.{self.iteration}.log" ) - previous_mat_path = os.path.join(self.working_directory, "lda.mat") - new_mat_path = os.path.join(self.working_directory, "lda_new.mat") - composed_path = os.path.join(self.working_directory, "lda_composed.mat") + previous_mat_path = self.working_directory.joinpath("lda.mat") + new_mat_path = self.working_directory.joinpath("lda_new.mat") + composed_path = self.working_directory.joinpath("lda_composed.mat") with mfa_open(log_path, "a") as log_file: macc_list = [] for x in arguments: diff --git a/montreal_forced_aligner/acoustic_modeling/monophone.py b/montreal_forced_aligner/acoustic_modeling/monophone.py index 21893db6..44185a22 100644 --- a/montreal_forced_aligner/acoustic_modeling/monophone.py +++ b/montreal_forced_aligner/acoustic_modeling/monophone.py @@ -7,6 +7,7 @@ import re import subprocess import typing +from pathlib import Path from queue import Empty import tqdm @@ -32,7 +33,7 @@ class MonoAlignEqualArguments(MfaArguments): """Arguments for :func:`~montreal_forced_aligner.acoustic_modeling.monophone.MonoAlignEqualFunction`""" - model_path: str + model_path: Path feature_options: MetaDict @@ -180,7 +181,7 @@ def mono_align_equal_arguments(self) -> typing.List[MonoAlignEqualArguments]: MonoAlignEqualArguments( j.id, getattr(self, 
"db_string", ""), - os.path.join(self.working_log_directory, f"mono_align_equal.{j.id}.log"), + self.working_log_directory.joinpath(f"mono_align_equal.{j.id}.log"), self.model_path, self.feature_options, ) @@ -286,7 +287,7 @@ def mono_align_equal(self) -> None: for num_utterances, errors in function.run(): pbar.update(num_utterances + errors) - log_path = os.path.join(self.working_log_directory, "update.0.log") + log_path = self.working_log_directory.joinpath("update.0.log") with mfa_open(log_path, "w") as log_file: acc_files = [] for j in self.jobs: @@ -324,7 +325,7 @@ def _trainer_initialization(self) -> None: if self.initialized: return self.iteration = 0 - tree_path = os.path.join(self.working_directory, "tree") + tree_path = self.working_directory.joinpath("tree") feat_dim = self.worker.get_feat_dim() feature_string = self.jobs[0].construct_feature_proc_string( @@ -336,8 +337,8 @@ def _trainer_initialization(self) -> None: self.feature_options["uses_speaker_adaptation"], ) shared_phones_path = os.path.join(self.worker.phones_dir, "sets.int") - init_log_path = os.path.join(self.working_log_directory, "init.log") - temp_feats_path = os.path.join(self.working_directory, "temp_feats") + init_log_path = self.working_log_directory.joinpath("init.log") + temp_feats_path = self.working_directory.joinpath("temp_feats") with mfa_open(init_log_path, "w") as log_file: subprocess.call( [ diff --git a/montreal_forced_aligner/acoustic_modeling/pronunciation_probabilities.py b/montreal_forced_aligner/acoustic_modeling/pronunciation_probabilities.py index 3f7bf885..845ff702 100644 --- a/montreal_forced_aligner/acoustic_modeling/pronunciation_probabilities.py +++ b/montreal_forced_aligner/acoustic_modeling/pronunciation_probabilities.py @@ -6,6 +6,7 @@ import shutil import time import typing +from pathlib import Path import tqdm from sqlalchemy.orm import joinedload @@ -72,47 +73,47 @@ def _trainer_initialization(self) -> None: pass @property - def exported_model_path(self) -> str: + def exported_model_path(self) -> Path: """Path to exported acoustic model""" return self.previous_trainer.exported_model_path @property - def model_path(self) -> str: + def model_path(self) -> Path: """Current acoustic model path""" - return os.path.join(self.working_directory, "final.mdl") + return self.working_directory.joinpath("final.mdl") @property - def alignment_model_path(self) -> str: + def alignment_model_path(self) -> Path: """Alignment model path""" - path = self.model_path.replace(".mdl", ".alimdl") + path = self.model_path.with_suffix(".alimdl") if os.path.exists(path): return path return self.model_path @property - def phone_symbol_table_path(self) -> str: + def phone_symbol_table_path(self) -> Path: """Worker's phone symbol table""" return self.worker.phone_symbol_table_path @property - def grapheme_symbol_table_path(self) -> str: + def grapheme_symbol_table_path(self) -> Path: """Worker's grapheme symbol table""" return self.worker.grapheme_symbol_table_path @property - def input_path(self) -> str: + def input_path(self) -> Path: """Path to temporary file to store training data""" - return os.path.join(self.working_directory, f"input_{self._data_source}.txt") + return self.working_directory.joinpath(f"input_{self._data_source}.txt") @property - def output_path(self) -> str: + def output_path(self) -> Path: """Path to temporary file to store training data""" - return os.path.join(self.working_directory, f"output_{self._data_source}.txt") + return 
self.working_directory.joinpath(f"output_{self._data_source}.txt") @property - def output_alignment_path(self) -> str: + def output_alignment_path(self) -> Path: """Path to temporary file to store training data""" - return os.path.join(self.working_directory, f"output_{self._data_source}_alignment.txt") + return self.working_directory.joinpath(f"output_{self._data_source}_alignment.txt") def generate_pronunciations_arguments(self) -> typing.List[GeneratePronunciationsArguments]: """ @@ -128,7 +129,7 @@ def generate_pronunciations_arguments(self) -> typing.List[GeneratePronunciation GeneratePronunciationsArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"generate_pronunciations.{j.id}.log"), + self.working_log_directory.joinpath(f"generate_pronunciations.{j.id}.log"), self.model_path, True, ) @@ -208,11 +209,11 @@ def train_g2p_lexicon(self) -> None: os.makedirs(self.working_log_directory, exist_ok=True) dictionaries = session.query(Dictionary) shutil.copyfile( - self.phone_symbol_table_path, os.path.join(self.working_directory, "phones.txt") + self.phone_symbol_table_path, self.working_directory.joinpath("phones.txt") ) shutil.copyfile( self.grapheme_symbol_table_path, - os.path.join(self.working_directory, "graphemes.txt"), + self.working_directory.joinpath("graphemes.txt"), ) self.input_token_type = self.grapheme_symbol_table_path self.output_token_type = self.phone_symbol_table_path @@ -233,7 +234,7 @@ def train_g2p_lexicon(self) -> None: logger.debug( f"Generating model for {d.name} took {time.time() - begin:.3f} seconds" ) - os.rename(d.lexicon_fst_path, d.lexicon_fst_path + ".backup") + os.rename(d.lexicon_fst_path, d.lexicon_fst_path.with_suffix(".backup")) os.rename(self.fst_path, d.lexicon_fst_path) if not GLOBAL_CONFIG.current_profile.debug: @@ -242,7 +243,7 @@ def train_g2p_lexicon(self) -> None: os.remove(self.output_far_path) for f in os.listdir(self.working_directory): if any(f.endswith(x) for x in [".fst", ".like", ".far", ".enc"]): - os.remove(os.path.join(self.working_directory, f)) + os.remove(self.working_directory.joinpath(f)) begin = time.time() self.align_g2p(self.output_alignment_path) @@ -254,7 +255,7 @@ def train_g2p_lexicon(self) -> None: logger.debug( f"Generating model for {d.name} took {time.time() - begin:.3f} seconds" ) - os.rename(d.align_lexicon_path, d.align_lexicon_path + ".backup") + os.rename(d.align_lexicon_path, d.align_lexicon_path.with_suffix(".backup")) os.rename(self.fst_path, d.align_lexicon_path) if not GLOBAL_CONFIG.current_profile.debug: os.remove(self.output_alignment_path) @@ -263,12 +264,12 @@ def train_g2p_lexicon(self) -> None: os.remove(self.output_far_path) for f in os.listdir(self.working_directory): if any(f.endswith(x) for x in [".fst", ".like", ".far", ".enc"]): - os.remove(os.path.join(self.working_directory, f)) + os.remove(self.working_directory.joinpath(f)) d.use_g2p = True session.commit() self.worker.use_g2p = True - def export_model(self, output_model_path: str) -> None: + def export_model(self, output_model_path: Path) -> None: """ Export an acoustic model to the specified path @@ -284,11 +285,11 @@ def setup(self): previous_directory = self.previous_aligner.working_directory for j in self.jobs: for p in j.construct_path_dictionary(previous_directory, "ali", "ark").values(): - shutil.copy(p, p.replace(previous_directory, wf.working_directory)) + shutil.copy(p, wf.working_directory.joinpath(p.name)) for f in ["final.mdl", "final.alimdl", "final.occs", "lda.mat"]: - p = 
os.path.join(previous_directory, f) + p = previous_directory.joinpath(f) if os.path.exists(p): - shutil.copy(p, p.replace(previous_directory, wf.working_directory)) + shutil.copy(p, wf.working_directory.joinpath(p.name)) def train_pronunciation_probabilities(self) -> None: """ @@ -310,7 +311,7 @@ def train_pronunciation_probabilities(self) -> None: self.working_directory, f"{self.worker.dictionary_base_names[d.id]}.fst", ) - os.rename(d.lexicon_fst_path, d.lexicon_fst_path + ".backup") + os.rename(d.lexicon_fst_path, d.lexicon_fst_path.with_suffix(".backup")) shutil.copy(fst_path, d.lexicon_fst_path) d.use_g2p = True session.commit() @@ -333,7 +334,7 @@ def train_pronunciation_probabilities(self) -> None: .filter(Word.dictionary_id == d.id) ) cache = {(x.word.word, x.pronunciation): x for x in pronunciations} - new_dictionary_path = os.path.join(self.working_directory, f"{d.id}.dict") + new_dictionary_path = self.working_directory.joinpath(f"{d.id}.dict") for ( word, pron, @@ -388,7 +389,7 @@ def train_pronunciation_probabilities(self) -> None: self.worker.write_lexicon_information() with self.worker.session() as session: for d in session.query(Dictionary): - dict_path = os.path.join(self.working_directory, f"{d.id}.dict") + dict_path = self.working_directory.joinpath(f"{d.id}.dict") self.worker.export_trained_rules(self.working_directory) self.worker.export_lexicon( d.id, diff --git a/montreal_forced_aligner/acoustic_modeling/sat.py b/montreal_forced_aligner/acoustic_modeling/sat.py index 302c9b18..8706d8a4 100644 --- a/montreal_forced_aligner/acoustic_modeling/sat.py +++ b/montreal_forced_aligner/acoustic_modeling/sat.py @@ -9,6 +9,7 @@ import subprocess import time import typing +from pathlib import Path from queue import Empty from typing import Dict, List @@ -38,9 +39,9 @@ class AccStatsTwoFeatsArguments(MfaArguments): """Arguments for :func:`~montreal_forced_aligner.acoustic_modeling.sat.AccStatsTwoFeatsFunction`""" dictionaries: List[str] - ali_paths: Dict[str, str] - acc_paths: Dict[str, str] - model_path: str + ali_paths: Dict[str, Path] + acc_paths: Dict[str, Path] + model_path: Path feature_strings: Dict[str, str] si_feature_strings: Dict[str, str] @@ -193,7 +194,7 @@ def acc_stats_two_feats_arguments(self) -> List[AccStatsTwoFeatsArguments]: AccStatsTwoFeatsArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"acc_stats_two_feats.{j.id}.log"), + self.working_log_directory.joinpath(f"acc_stats_two_feats.{j.id}.log"), j.dictionary_ids, j.construct_path_dictionary(self.working_directory, "ali", "ark"), j.construct_path_dictionary(self.working_directory, "two_feat_acc", "ark"), @@ -233,7 +234,7 @@ def _trainer_initialization(self) -> None: if os.path.exists(os.path.join(self.previous_aligner.working_directory, "lda.mat")): shutil.copyfile( os.path.join(self.previous_aligner.working_directory, "lda.mat"), - os.path.join(self.working_directory, "lda.mat"), + self.working_directory.joinpath("lda.mat"), ) for j in self.jobs: for path in j.construct_path_dictionary( @@ -281,8 +282,8 @@ def finalize_training(self) -> None: self.create_align_model() self.uses_speaker_adaptation = True super().finalize_training() - assert self.alignment_model_path.endswith("final.alimdl") - assert os.path.exists(self.alignment_model_path) + assert self.alignment_model_path.name == "final.alimdl" + assert self.alignment_model_path.exists() except Exception as e: if isinstance(e, KaldiProcessingError): log_kaldi_errors(e.error_logs) @@ -311,9 +312,9 @@ def 
train_iteration(self) -> None: self.iteration += 1 @property - def alignment_model_path(self) -> str: + def alignment_model_path(self) -> Path: """Alignment model path""" - path = self.model_path.replace(".mdl", ".alimdl") + path = self.model_path.with_suffix(".alimdl") if os.path.exists(path): return path return self.model_path @@ -378,7 +379,7 @@ def create_align_model(self) -> None: for _ in function.run(): pbar.update(1) - log_path = os.path.join(self.working_log_directory, "align_model_est.log") + log_path = self.working_log_directory.joinpath("align_model_est.log") with mfa_open(log_path, "w") as log_file: acc_files = [] @@ -397,14 +398,12 @@ def create_align_model(self) -> None: if not self.quick: est_command.append(f"--power={self.power}") else: - est_command.append( - f"--write-occs={os.path.join(self.working_directory, 'final.occs')}" - ) + est_command.append(f"--write-occs={self.working_directory.joinpath('final.occs')}") est_command.extend( [ self.model_path, "-", - self.model_path.replace(".mdl", ".alimdl"), + self.model_path.with_suffix(".alimdl"), ] ) est_proc = subprocess.Popen( diff --git a/montreal_forced_aligner/acoustic_modeling/trainer.py b/montreal_forced_aligner/acoustic_modeling/trainer.py index 0fc861ba..e00c6585 100644 --- a/montreal_forced_aligner/acoustic_modeling/trainer.py +++ b/montreal_forced_aligner/acoustic_modeling/trainer.py @@ -54,7 +54,7 @@ class TransitionAccArguments(MfaArguments): """Arguments for :class:`~montreal_forced_aligner.acoustic_modeling.trainer.TransitionAccFunction`""" - model_path: str + model_path: Path class TransitionAccFunction(KaldiFunction): @@ -254,7 +254,7 @@ def default_training_configurations(cls) -> List[Tuple[str, Dict[str, Any]]]: @classmethod def parse_parameters( cls, - config_path: Optional[str] = None, + config_path: Optional[Path] = None, args: Optional[Dict[str, Any]] = None, unknown_args: Optional[typing.Iterable[str]] = None, ) -> MetaDict: @@ -263,7 +263,7 @@ def parse_parameters( Parameters ---------- - config_path: str, optional + config_path: :class:`~pathlib.Path`, optional Path to yaml configuration file args: dict[str, Any] Parsed arguments @@ -429,7 +429,7 @@ def add_config(self, train_type: str, params: MetaDict) -> None: self.training_configs[identifier] = config - def export_model(self, output_model_path: str) -> None: + def export_model(self, output_model_path: Path) -> None: """ Export an acoustic model to the specified path @@ -540,7 +540,7 @@ def train(self) -> None: self.acoustic_model = AcousticModel(previous.exported_model_path, self.working_directory) self.align() self.finalize_training() - counts_path = os.path.join(self.working_directory, "phone_pdf.counts") + counts_path = self.working_directory.joinpath("phone_pdf.counts") new_counts_path = os.path.join(previous.working_directory, "phone_pdf.counts") if not os.path.exists(new_counts_path): shutil.copyfile(counts_path, new_counts_path) @@ -565,7 +565,7 @@ def transition_acc_arguments(self) -> List[TransitionAccArguments]: TransitionAccArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"test_utterances.{j.id}.log"), + self.working_log_directory.joinpath(f"test_utterances.{j.id}.log"), self.model_path, ) for j in self.jobs @@ -630,7 +630,7 @@ def compute_phone_pdf_counts(self) -> None: thirdparty_binary("vector-sum"), "--binary=false", *t_accs, - os.path.join(self.working_directory, "final.tacc"), + self.working_directory.joinpath("final.tacc"), ], stderr=subprocess.DEVNULL, ) @@ -669,7 +669,7 @@ def 
compute_phone_pdf_counts(self) -> None: if m: transition_id = int(m.group("transition_id")) phone_pdfs[transition_id] = (phone, pdf) - with mfa_open(os.path.join(self.working_directory, "final.tacc"), "r") as f: + with mfa_open(self.working_directory.joinpath("final.tacc"), "r") as f: data = f.read().strip().split()[1:-1] transition_counts = { @@ -683,7 +683,7 @@ def compute_phone_pdf_counts(self) -> None: pdf_counts[pdf] += transition_counts[transition_id] pdf_phone_counts[(phone, pdf)] += transition_counts[transition_id] phone_pdf_mapping[phone][pdf] += transition_counts[transition_id] - with mfa_open(os.path.join(self.working_directory, "phone_pdf.counts"), "w") as f: + with mfa_open(self.working_directory.joinpath("phone_pdf.counts"), "w") as f: json.dump(phone_pdf_mapping, f, ensure_ascii=False) logger.debug(f"Accumulating transition stats took {time.time() - begin:.3f} seconds") logger.info("Finished accumulating transition stats!") @@ -710,7 +710,7 @@ def export_files( Parameters ---------- - output_directory: Path + output_directory: :class:`~pathlib.Path` Directory to save to output_format: str, optional Format to save alignments, one of 'long_textgrids' (the default), 'short_textgrids', or 'json', passed to praatio @@ -769,10 +769,10 @@ def align(self) -> None: if not os.path.exists(path): missing_transforms = True if missing_transforms: - assert self.alignment_model_path.endswith(".alimdl") + assert self.alignment_model_path.suffix == ".alimdl" self.calc_fmllr() self.uses_speaker_adaptation = True - assert self.alignment_model_path.endswith(".mdl") + assert self.alignment_model_path.suffix == ".mdl" self.align_utterances() if self.current_subset: logger.debug( @@ -800,19 +800,19 @@ def align(self) -> None: raise @property - def alignment_model_path(self) -> str: + def alignment_model_path(self) -> Path: """Current alignment model path""" - path = os.path.join(self.working_directory, "final.alimdl") + path = self.working_directory.joinpath("final.alimdl") if os.path.exists(path) and not self.uses_speaker_adaptation: return path return self.model_path @property - def model_path(self) -> str: + def model_path(self) -> Path: """Current model path""" if self.current_trainer is not None: return self.current_trainer.model_path - return os.path.join(self.working_directory, "final.mdl") + return self.working_directory.joinpath("final.mdl") @property def data_directory(self) -> str: @@ -820,15 +820,15 @@ def data_directory(self) -> str: return self.subset_directory(self.current_subset) @property - def working_directory(self) -> Optional[str]: + def working_directory(self) -> Optional[Path]: """Working directory""" if self.current_trainer is not None and not self.current_trainer.training_complete: return self.current_trainer.working_directory if self.current_aligner is None: return None - return os.path.join(self.output_directory, f"{self.current_aligner.identifier}_ali") + return self.output_directory.joinpath(f"{self.current_aligner.identifier}_ali") @property def working_log_directory(self) -> Optional[str]: """Current log directory""" - return os.path.join(self.working_directory, "log") + return self.working_directory.joinpath("log") diff --git a/montreal_forced_aligner/acoustic_modeling/triphone.py b/montreal_forced_aligner/acoustic_modeling/triphone.py index 7ce47280..420798c3 100644 --- a/montreal_forced_aligner/acoustic_modeling/triphone.py +++ b/montreal_forced_aligner/acoustic_modeling/triphone.py @@ -7,6 +7,7 @@ import re import subprocess import typing +from pathlib import Path 
from queue import Empty from typing import TYPE_CHECKING, Dict, List @@ -45,21 +46,21 @@ class TreeStatsArguments(MfaArguments): dictionaries: List[str] ci_phones: str - model_path: str + model_path: Path feature_strings: Dict[str, str] - ali_paths: Dict[str, str] - treeacc_paths: Dict[str, str] + ali_paths: Dict[str, Path] + treeacc_paths: Dict[str, Path] class ConvertAlignmentsArguments(MfaArguments): """Arguments for :func:`~montreal_forced_aligner.acoustic_modeling.triphone.ConvertAlignmentsFunction`""" dictionaries: List[str] - model_path: str - tree_path: str - align_model_path: str - ali_paths: Dict[str, str] - new_ali_paths: Dict[str, str] + model_path: Path + tree_path: Path + align_model_path: Path + ali_paths: Dict[str, Path] + new_ali_paths: Dict[str, Path] class ConvertAlignmentsFunction(KaldiFunction): @@ -238,7 +239,7 @@ def tree_stats_arguments(self) -> List[TreeStatsArguments]: TreeStatsArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"acc_tree.{j.id}.log"), + self.working_log_directory.joinpath(f"acc_tree.{j.id}.log"), j.dictionary_ids, self.worker.context_independent_csl, alignment_model_path, @@ -262,7 +263,7 @@ def convert_alignments_arguments(self) -> List[ConvertAlignmentsArguments]: ConvertAlignmentsArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"convert_alignments.{j.id}.log"), + self.working_log_directory.joinpath(f"convert_alignments.{j.id}.log"), j.dictionary_ids, self.model_path, self.tree_path, @@ -402,12 +403,12 @@ def tree_stats(self) -> None: tree_accs = [] for x in jobs: tree_accs.extend(x.treeacc_paths.values()) - log_path = os.path.join(self.working_log_directory, "sum_tree_acc.log") + log_path = self.working_log_directory.joinpath("sum_tree_acc.log") with mfa_open(log_path, "w") as log_file: subprocess.call( [ thirdparty_binary("sum-tree-stats"), - os.path.join(self.working_directory, "treeacc"), + self.working_directory.joinpath("treeacc"), ] + tree_accs, stderr=log_file, @@ -425,15 +426,15 @@ def _setup_tree(self, init_from_previous=False, initial_mix_up=True) -> None: :class:`~montreal_forced_aligner.exceptions.KaldiProcessingError` If there were any errors in running Kaldi binaries """ - log_path = os.path.join(self.working_log_directory, "questions.log") - tree_path = os.path.join(self.working_directory, "tree") - treeacc_path = os.path.join(self.working_directory, "treeacc") + log_path = self.working_log_directory.joinpath("questions.log") + tree_path = self.working_directory.joinpath("tree") + treeacc_path = self.working_directory.joinpath("treeacc") sets_int_path = os.path.join(self.worker.phones_dir, "sets.int") roots_int_path = os.path.join(self.worker.phones_dir, "roots.int") extra_question_int_path = os.path.join(self.worker.phones_dir, "extra_questions.int") topo_path = self.worker.topo_path - questions_path = os.path.join(self.working_directory, "questions.int") - questions_qst_path = os.path.join(self.working_directory, "questions.qst") + questions_path = self.working_directory.joinpath("questions.int") + questions_qst_path = self.working_directory.joinpath("questions.qst") with mfa_open(log_path, "w") as log_file: subprocess.call( [ @@ -449,7 +450,7 @@ def _setup_tree(self, init_from_previous=False, initial_mix_up=True) -> None: for line in inf: outf.write(line) - log_path = os.path.join(self.working_log_directory, "compile_questions.log") + log_path = self.working_log_directory.joinpath("compile_questions.log") with mfa_open(log_path, "w") as 
log_file: subprocess.call( [ @@ -461,7 +462,7 @@ def _setup_tree(self, init_from_previous=False, initial_mix_up=True) -> None: stderr=log_file, ) - log_path = os.path.join(self.working_log_directory, "build_tree.log") + log_path = self.working_log_directory.joinpath("build_tree.log") with mfa_open(log_path, "w") as log_file: subprocess.call( [ @@ -478,8 +479,8 @@ def _setup_tree(self, init_from_previous=False, initial_mix_up=True) -> None: stderr=log_file, ) - log_path = os.path.join(self.working_log_directory, "init_model.log") - occs_path = os.path.join(self.working_directory, "0.occs") + log_path = self.working_log_directory.joinpath("init_model.log") + occs_path = self.working_directory.joinpath("0.occs") mdl_path = self.model_path if init_from_previous: command = [ @@ -521,7 +522,7 @@ def _setup_tree(self, init_from_previous=False, initial_mix_up=True) -> None: occs_path, mdl_path, ] - log_path = os.path.join(self.working_log_directory, "mixup.log") + log_path = self.working_log_directory.joinpath("mixup.log") with mfa_open(log_path, "w") as log_file: subprocess.call(command, stderr=log_file) os.remove(treeacc_path) diff --git a/montreal_forced_aligner/alignment/adapting.py b/montreal_forced_aligner/alignment/adapting.py index 1f7b18b6..312b8389 100644 --- a/montreal_forced_aligner/alignment/adapting.py +++ b/montreal_forced_aligner/alignment/adapting.py @@ -7,6 +7,7 @@ import shutil import subprocess import time +from pathlib import Path from queue import Empty from typing import TYPE_CHECKING, List @@ -96,7 +97,7 @@ def map_acc_stats_arguments(self, alignment=False) -> List[AccStatsArguments]: AccStatsArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"map_acc_stats.{j.id}.log"), + self.working_log_directory.joinpath(f"map_acc_stats.{j.id}.log"), j.dictionary_ids, feat_strings, j.construct_path_dictionary(self.working_directory, "ali", "ark"), @@ -117,11 +118,11 @@ def acc_stats(self, alignment: bool = False) -> None: """ arguments = self.map_acc_stats_arguments(alignment) if alignment: - initial_mdl_path = os.path.join(self.working_directory, "unadapted.alimdl") - final_mdl_path = os.path.join(self.working_directory, "final.alimdl") + initial_mdl_path = self.working_directory.joinpath("unadapted.alimdl") + final_mdl_path = self.working_directory.joinpath("final.alimdl") else: - initial_mdl_path = os.path.join(self.working_directory, "unadapted.mdl") - final_mdl_path = os.path.join(self.working_directory, "final.mdl") + initial_mdl_path = self.working_directory.joinpath("unadapted.mdl") + final_mdl_path = self.working_directory.joinpath("final.mdl") logger.info("Accumulating statistics...") with tqdm.tqdm(total=self.num_current_utterances, disable=GLOBAL_CONFIG.quiet) as pbar: if GLOBAL_CONFIG.use_mp: @@ -161,8 +162,8 @@ def acc_stats(self, alignment: bool = False) -> None: function = AccStatsFunction(args) for num_utterances, errors in function.run(): pbar.update(num_utterances + errors) - log_path = os.path.join(self.working_log_directory, "map_model_est.log") - occs_path = os.path.join(self.working_directory, "final.occs") + log_path = self.working_log_directory.joinpath("map_model_est.log") + occs_path = self.working_directory.joinpath("final.occs") with mfa_open(log_path, "w") as log_file: acc_files = [] for j in arguments: @@ -211,20 +212,20 @@ def align_directory(self) -> str: @property def working_log_directory(self) -> str: """Current log directory""" - return os.path.join(self.working_directory, "log") + return 
self.working_directory.joinpath("log") @property def model_path(self) -> str: """Current acoustic model path""" if self.current_workflow.workflow_type == WorkflowType.acoustic_model_adaptation: - return os.path.join(self.working_directory, "unadapted.mdl") - return os.path.join(self.working_directory, "final.mdl") + return self.working_directory.joinpath("unadapted.mdl") + return self.working_directory.joinpath("final.mdl") @property def alignment_model_path(self) -> str: """Current acoustic model path""" if self.current_workflow.workflow_type == WorkflowType.acoustic_model_adaptation: - path = os.path.join(self.working_directory, "unadapted.alimdl") + path = self.working_directory.joinpath("unadapted.alimdl") if os.path.exists(path) and not getattr(self, "uses_speaker_adaptation", False): return path return self.model_path @@ -233,7 +234,7 @@ def alignment_model_path(self) -> str: @property def next_model_path(self) -> str: """Mapped acoustic model path""" - return os.path.join(self.working_directory, "final.mdl") + return self.working_directory.joinpath("final.mdl") def train_map(self) -> None: """ @@ -275,15 +276,15 @@ def adapt(self) -> None: for f in ["final.mdl", "final.alimdl"]: shutil.copyfile( os.path.join(alignment_workflow.working_directory, f), - os.path.join(self.working_directory, f.replace("final", "unadapted")), + self.working_directory.joinpath(f).with_stem("unadapted"), ) shutil.copyfile( os.path.join(alignment_workflow.working_directory, "tree"), - os.path.join(self.working_directory, "tree"), + self.working_directory.joinpath("tree"), ) shutil.copyfile( os.path.join(alignment_workflow.working_directory, "lda.mat"), - os.path.join(self.working_directory, "lda.mat"), + self.working_directory.joinpath("lda.mat"), ) for j in self.jobs: old_paths = j.construct_path_dictionary( @@ -302,27 +303,27 @@ def adapt(self) -> None: try: logger.info("Adapting pretrained model...") self.train_map() - self.export_model(os.path.join(self.working_log_directory, "acoustic_model.zip")) + self.export_model(self.working_log_directory.joinpath("acoustic_model.zip")) shutil.copyfile( - os.path.join(self.working_directory, "final.mdl"), + self.working_directory.joinpath("final.mdl"), os.path.join(self.align_directory, "final.mdl"), ) shutil.copyfile( - os.path.join(self.working_directory, "final.occs"), + self.working_directory.joinpath("final.occs"), os.path.join(self.align_directory, "final.occs"), ) shutil.copyfile( - os.path.join(self.working_directory, "tree"), + self.working_directory.joinpath("tree"), os.path.join(self.align_directory, "tree"), ) - if os.path.exists(os.path.join(self.working_directory, "final.alimdl")): + if os.path.exists(self.working_directory.joinpath("final.alimdl")): shutil.copyfile( - os.path.join(self.working_directory, "final.alimdl"), + self.working_directory.joinpath("final.alimdl"), os.path.join(self.align_directory, "final.alimdl"), ) - if os.path.exists(os.path.join(self.working_directory, "lda.mat")): + if os.path.exists(self.working_directory.joinpath("lda.mat")): shutil.copyfile( - os.path.join(self.working_directory, "lda.mat"), + self.working_directory.joinpath("lda.mat"), os.path.join(self.align_directory, "lda.mat"), ) wf = self.current_workflow @@ -360,7 +361,7 @@ def meta(self) -> MetaDict: } return data - def export_model(self, output_model_path: str) -> None: + def export_model(self, output_model_path: Path) -> None: """ Output an acoustic model to the specified path @@ -369,9 +370,11 @@ def export_model(self, output_model_path: str) -> None: 
-        output_model_path : str +        output_model_path : :class:`~pathlib.Path` Path to save adapted acoustic model """ - directory, filename = os.path.split(output_model_path) - basename, _ = os.path.splitext(filename) - acoustic_model = AcousticModel.empty(basename, root_directory=self.working_log_directory) + directory = output_model_path.parent + + acoustic_model = AcousticModel.empty( + output_model_path.stem, root_directory=self.working_log_directory + ) acoustic_model.add_meta_file(self) acoustic_model.add_model(self.working_directory) if directory: diff --git a/montreal_forced_aligner/alignment/base.py b/montreal_forced_aligner/alignment/base.py index 04a918b0..683538c2 100644 --- a/montreal_forced_aligner/alignment/base.py +++ b/montreal_forced_aligner/alignment/base.py @@ -171,7 +171,7 @@ def alignment_extraction_arguments(self) -> List[AlignmentExtractionArguments]: AlignmentExtractionArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"get_phone_ctm.{j.id}.log"), + self.working_log_directory.joinpath(f"get_phone_ctm.{j.id}.log"), self.alignment_model_path, round(self.frame_shift / 1000, 4), self.phone_symbol_table_path, @@ -203,7 +203,7 @@ def export_textgrid_arguments( ExportTextGridArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"export_textgrids.{j.id}.log"), + self.working_log_directory.joinpath(f"export_textgrids.{j.id}.log"), self.export_frame_shift, GLOBAL_CONFIG.cleanup_textgrids, self.clitic_marker, @@ -230,7 +230,7 @@ def generate_pronunciations_arguments( GeneratePronunciationsArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"generate_pronunciations.{j.id}.log"), + self.working_log_directory.joinpath(f"generate_pronunciations.{j.id}.log"), self.model_path, False, ) @@ -264,15 +264,15 @@ def align(self, workflow_name=None) -> None: acoustic_model is not None and acoustic_model.meta["features"]["uses_speaker_adaptation"] ): - if self.alignment_model_path.endswith(".mdl"): - if os.path.exists(self.alignment_model_path.replace(".mdl", ".alimdl")): + if self.alignment_model_path.suffix == ".mdl": + if os.path.exists(self.alignment_model_path.with_suffix(".alimdl")): raise AlignerError( "Not using speaker independent model when it is available" ) self.calc_fmllr() self.uses_speaker_adaptation = True - assert self.alignment_model_path.endswith(".mdl") + assert self.alignment_model_path.suffix == ".mdl" logger.info("Performing second-pass alignment...") self.align_utterances() self.collect_alignments() @@ -377,7 +377,7 @@ def compute_pronunciation_probabilities(self): final_silence_correction_sum = 0 final_non_silence_correction_sum = 0 with mfa_open( - os.path.join(self.working_log_directory, "pronunciation_probability_calculation.log"), + self.working_log_directory.joinpath("pronunciation_probability_calculation.log"), "w", encoding="utf8", ) as log_file, self.session() as session: @@ -952,7 +952,7 @@ def fine_tune_arguments(self) -> List[FineTuneArguments]: """ args = [] for j in self.jobs: - log_path = os.path.join(self.working_log_directory, f"fine_tune.{j.id}.log") + log_path = self.working_log_directory.joinpath(f"fine_tune.{j.id}.log") args.append( FineTuneArguments( j.id, @@ -1141,7 +1141,7 @@ def export_files( Parameters ---------- - output_directory: Path + output_directory: :class:`~pathlib.Path` Directory to save to output_format: str, optional Format to save alignments, one of 'long_textgrids' (the default), 'short_textgrids', or 'json', passed to praatio diff --git 
a/montreal_forced_aligner/alignment/mixins.py b/montreal_forced_aligner/alignment/mixins.py index ca309f2e..56ee9c44 100644 --- a/montreal_forced_aligner/alignment/mixins.py +++ b/montreal_forced_aligner/alignment/mixins.py @@ -8,6 +8,7 @@ import os import time from abc import abstractmethod +from pathlib import Path from queue import Empty from typing import TYPE_CHECKING, Dict, List @@ -107,9 +108,9 @@ def __init__( self.unaligned_files = set() @property - def tree_path(self) -> str: + def tree_path(self) -> Path: """Path to tree file""" - return os.path.join(self.working_directory, "tree") + return self.working_directory.joinpath("tree") @property @abstractmethod @@ -140,8 +141,8 @@ def compile_train_graphs_arguments(self) -> List[CompileTrainGraphsArguments]: CompileTrainGraphsArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"compile_train_graphs.{j.id}.log"), - os.path.join(self.working_directory, "tree"), + self.working_log_directory.joinpath(f"compile_train_graphs.{j.id}.log"), + self.working_directory.joinpath("tree"), model_path, getattr(self, "use_g2p", False), ) @@ -161,13 +162,11 @@ def align_arguments(self) -> List[AlignArguments]: iteration = getattr(self, "iteration", None) for j in self.jobs: if iteration is not None: - log_path = os.path.join( - self.working_log_directory, f"align.{iteration}.{j.id}.log" - ) + log_path = self.working_log_directory.joinpath(f"align.{iteration}.{j.id}.log") else: - log_path = os.path.join(self.working_log_directory, f"align.{j.id}.log") + log_path = self.working_log_directory.joinpath(f"align.{j.id}.log") if getattr(self, "uses_speaker_adaptation", False): - log_path = log_path.replace(".log", ".fmllr.log") + log_path = log_path.with_suffix(".fmllr.log") args.append( AlignArguments( j.id, @@ -196,7 +195,7 @@ def phone_confidence_arguments(self) -> List[PhoneConfidenceArguments]: """ args = [] for j in self.jobs: - log_path = os.path.join(self.working_log_directory, f"phone_confidence.{j.id}.log") + log_path = self.working_log_directory.joinpath(f"phone_confidence.{j.id}.log") feat_strings = {} for d in j.dictionaries: @@ -237,12 +236,12 @@ def compile_information_arguments(self) -> List[CompileInformationArguments]: self.working_log_directory, f"align.{iteration}.{j.id}.log" ) else: - log_path = os.path.join(self.working_log_directory, f"align.{j.id}.log") + log_path = self.working_log_directory.joinpath(f"align.{j.id}.log") args.append( CompileInformationArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"compile_information.{j.id}.log"), + self.working_log_directory.joinpath(f"compile_information.{j.id}.log"), log_path, ) ) @@ -553,7 +552,7 @@ def compile_information(self) -> None: average_logdet_sum += data["logdet"] * data["logdet_frames"] if hasattr(self, "db_engine"): - csv_path = os.path.join(self.working_directory, "alignment_log_likelihood.csv") + csv_path = self.working_directory.joinpath("alignment_log_likelihood.csv") with mfa_open(csv_path, "w") as f, self.session() as session: writer = csv.writer(f) writer.writerow(["file", "begin", "end", "speaker", "loglikelihood"]) @@ -598,30 +597,30 @@ def compile_information(self) -> None: @property @abstractmethod - def working_directory(self) -> str: + def working_directory(self) -> Path: """Working directory""" ... @property @abstractmethod - def working_log_directory(self) -> str: + def working_log_directory(self) -> Path: """Working log directory""" ... 
@property - def model_path(self) -> str: + def model_path(self) -> Path: """Acoustic model file path""" - return os.path.join(self.working_directory, "final.mdl") + return self.working_directory.joinpath("final.mdl") @property - def phone_pdf_counts_path(self) -> str: + def phone_pdf_counts_path(self) -> Path: """Phone PDF counts file path""" - return os.path.join(self.working_directory, "phone_pdf.counts") + return self.working_directory.joinpath("phone_pdf.counts") @property - def alignment_model_path(self) -> str: + def alignment_model_path(self) -> Path: """Acoustic model file path for speaker-independent alignment""" - path = os.path.join(self.working_directory, "final.alimdl") + path = self.working_directory.joinpath("final.alimdl") if os.path.exists(path) and not getattr(self, "uses_speaker_adaptation", False): return path return self.model_path diff --git a/montreal_forced_aligner/alignment/multiprocessing.py b/montreal_forced_aligner/alignment/multiprocessing.py index df474017..68e479d9 100644 --- a/montreal_forced_aligner/alignment/multiprocessing.py +++ b/montreal_forced_aligner/alignment/multiprocessing.py @@ -211,19 +211,19 @@ class GeneratePronunciationsArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run - text_int_paths: dict[int, str] + text_int_paths: dict[int, Path] Per dictionary text SCP paths - ali_paths: dict[int, str] + ali_paths: dict[int, Path] Per dictionary alignment paths - model_path: str + model_path: :class:`~pathlib.Path` Acoustic model path for_g2p: bool Flag for training a G2P model with acoustic information """ - model_path: str + model_path: Path for_g2p: bool @@ -238,25 +238,25 @@ class AlignmentExtractionArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run - model_path: str + model_path: :class:`~pathlib.Path` Acoustic model path frame_shift: float Frame shift in seconds - ali_paths: dict[int, str] + ali_paths: dict[int, Path] Per dictionary alignment paths - text_int_paths: dict[int, str] + text_int_paths: dict[int, Path] Per dictionary text SCP paths - phone_symbol_path: str + phone_symbol_path: :class:`~pathlib.Path` Path to phone symbols table score_options: dict[str, Any] Options for Kaldi functions """ - model_path: str + model_path: Path frame_shift: float - phone_symbol_path: str + phone_symbol_path: Path score_options: MetaDict confidence: bool transcription: bool @@ -273,7 +273,7 @@ class ExportTextGridArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run export_frame_shift: float Frame shift in seconds @@ -281,20 +281,18 @@ Flag to cleanup silences and recombine words clitic_marker: str Marker indicating clitics - output_directory: str + output_directory: :class:`~pathlib.Path` Directory for exporting output_format: str Format to export include_original_text: bool Flag for including original unnormalized text as a tier - workflow_id: int - Integer id of workflow to export """ export_frame_shift: float cleanup_textgrids: bool clitic_marker: str - output_directory: str + output_directory: Path output_format: str include_original_text: bool @@ -310,13 +308,13 @@ class 
CompileInformationArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run - align_log_path: str + align_log_path: :class:`~pathlib.Path` Path to log file for parsing """ - align_log_path: str + align_log_path: Path @dataclass @@ -330,22 +328,18 @@ class CompileTrainGraphsArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run - dictionaries: list[int] - List of dictionary ids - tree_path: str + tree_path: :class:`~pathlib.Path` Path to tree file - model_path: str + model_path: :class:`~pathlib.Path` Path to model file - text_int_paths: dict[int, str] - Mapping of dictionaries to text scp files - fst_ark_paths: dict[int, str] - Mapping of dictionaries to fst ark files + use_g2p: bool + Flag for whether acoustic model uses g2p """ - tree_path: str - model_path: str + tree_path: Path + model_path: Path use_g2p: bool @@ -360,23 +354,19 @@ class AlignArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run - dictionaries: list[int] - List of dictionary ids - fst_ark_paths: dict[int, str] - Mapping of dictionaries to fst ark files - feature_strings: dict[int, str] - Mapping of dictionaries to feature generation strings - model_path: str + model_path: :class:`~pathlib.Path` Path to model file - ali_paths: dict[int, str] - Per dictionary alignment paths align_options: dict[str, Any] Alignment options + feature_options: dict[str, Any] + Feature options + confidence: bool + Flag for outputting confidence """ - model_path: str + model_path: Path align_options: MetaDict feature_options: MetaDict confidence: bool @@ -393,40 +383,30 @@ class FineTuneArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run - working_directory: str - Current working directory - tree_path: str + tree_path: :class:`~pathlib.Path` Path to tree file - model_path: str + model_path: :class:`~pathlib.Path` Path to model file frame_shift: int Frame shift in ms - cmvn_paths: dict[int, str] - Mapping of dictionaries to CMVN scp paths - fmllr_paths: dict[int, str] - Mapping of dictionaries to fMLLR ark paths - lda_mat_path: str, optional - Path to LDA matrix file mfcc_options: dict[str, Any] MFCC computation options pitch_options: dict[str, Any] Pitch computation options align_options: dict[str, Any] Alignment options - workflow_id: int - Integer ID for workflow to fine tune position_dependent_phones: bool Flag for whether to use position dependent phones grouped_phones: dict[str, list[str]] Grouped lists of phones """ - phone_symbol_table_path: str - disambiguation_symbols_int_path: str - tree_path: str - model_path: str + phone_symbol_table_path: Path + disambiguation_symbols_int_path: Path + tree_path: Path + model_path: Path frame_shift: int mfcc_options: MetaDict pitch_options: MetaDict @@ -447,18 +427,18 @@ class PhoneConfidenceArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run - model_path: str + model_path: :class:`~pathlib.Path` Path to model file - 
phone_pdf_counts_path: str + phone_pdf_counts_path: :class:`~pathlib.Path` Path to output PDF counts feature_strings: dict[int, str] Mapping of dictionaries to feature generation strings """ - model_path: str - phone_pdf_counts_path: str + model_path: Path + phone_pdf_counts_path: Path feature_strings: Dict[int, str] @@ -473,25 +453,25 @@ class AccStatsArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run dictionaries: list[int] List of dictionary ids feature_strings: dict[int, str] Mapping of dictionaries to feature generation strings - ali_paths: dict[int, str] + ali_paths: dict[int, Path] Per dictionary alignment paths - acc_paths: dict[int, str] + acc_paths: dict[int, Path] Per dictionary accumulated stats paths - model_path: str + model_path: :class:`~pathlib.Path` Path to model file """ dictionaries: List[int] feature_strings: Dict[int, str] - ali_paths: Dict[int, str] - acc_paths: Dict[int, str] - model_path: str + ali_paths: Dict[int, Path] + acc_paths: Dict[int, Path] + model_path: Path class CompileTrainGraphsFunction(KaldiFunction): @@ -1735,7 +1715,7 @@ def compile_information_func( data = {"unaligned": [], "too_short": [], "log_like": 0, "total_frames": 0} align_log_path = arguments.align_log_path if not os.path.exists(align_log_path): - align_log_path = align_log_path.replace(".log", ".fmllr.log") + align_log_path = align_log_path.with_suffix(".fmllr.log") with mfa_open(arguments.log_path, "w"), mfa_open(align_log_path, "r") as f: for line in f: decode_error_match = re.match(decode_error_pattern, line) diff --git a/montreal_forced_aligner/alignment/pretrained.py b/montreal_forced_aligner/alignment/pretrained.py index 0f49c3c6..4ffd1033 100644 --- a/montreal_forced_aligner/alignment/pretrained.py +++ b/montreal_forced_aligner/alignment/pretrained.py @@ -7,6 +7,7 @@ import shutil import time import typing +from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, Optional import sqlalchemy @@ -67,7 +68,7 @@ class PretrainedAligner(TranscriberMixin, TopLevelMfaWorker): def __init__( self, - acoustic_model_path: str = None, + acoustic_model_path: Path = None, **kwargs, ): self.acoustic_model = AcousticModel(acoustic_model_path) @@ -77,17 +78,10 @@ def __init__( def setup_acoustic_model(self) -> None: """Set up the acoustic model""" - if self.acoustic_model.meta["version"] < "2.1": - logger.warning( - "The acoustic model was trained in an earlier version of MFA. " - "There may be incompatibilities in feature generation that cause errors. " - "Please download the latest version of the model via `mfa model download`, " - "use a different acoustic model, or use version 2.0.6 of MFA." 
- ) self.acoustic_model.export_model(self.working_directory) os.makedirs(self.phones_dir, exist_ok=True) for f in ["phones.txt", "graphemes.txt"]: - path = os.path.join(self.working_directory, f) + path = self.working_directory.joinpath(f) if os.path.exists(path): os.rename(path, os.path.join(self.phones_dir, f)) dict_info = self.acoustic_model.meta.get("dictionaries", None) @@ -229,7 +223,7 @@ def setup(self) -> None: @classmethod def parse_parameters( cls, - config_path: Optional[str] = None, + config_path: Optional[Path] = None, args: Optional[Dict[str, Any]] = None, unknown_args: Optional[typing.Iterable[str]] = None, ) -> MetaDict: @@ -238,7 +232,7 @@ def parse_parameters( Parameters ---------- - config_path: str + config_path: :class:`~pathlib.Path` Config path args: dict[str, Any] Parsed arguments @@ -301,7 +295,7 @@ def align_one_utterance(self, utterance: Utterance, session: Session) -> None: session.flush() if not sox_string: sox_string = utterance.file.sound_file.sound_file_path - text_int_path = os.path.join(self.working_directory, "text.int") + text_int_path = self.working_directory.joinpath("text.int") with mfa_open(text_int_path, "w") as f: normalized_text_int = " ".join( [ @@ -311,20 +305,20 @@ def align_one_utterance(self, utterance: Utterance, session: Session) -> None: ) f.write(f"{utterance.kaldi_id} {normalized_text_int}\n") if utterance.features: - feats_path = os.path.join(self.working_directory, "feats.scp") + feats_path = self.working_directory.joinpath("feats.scp") with mfa_open(feats_path, "w") as f: f.write(f"{utterance.kaldi_id} {utterance.features}\n") else: - wav_path = os.path.join(self.working_directory, "wav.scp") - segment_path = os.path.join(self.working_directory, "segments.scp") + wav_path = self.working_directory.joinpath("wav.scp") + segment_path = self.working_directory.joinpath("segments.scp") with mfa_open(wav_path, "w") as f: f.write(f"{utterance.file_id} {sox_string}\n") with mfa_open(segment_path, "w") as f: f.write( f"{utterance.kaldi_id} {utterance.file_id} {utterance.begin} {utterance.end} {utterance.channel}\n" ) - spk2utt_path = os.path.join(self.working_directory, "spk2utt.scp") - utt2spk_path = os.path.join(self.working_directory, "utt2spk.scp") + spk2utt_path = self.working_directory.joinpath("spk2utt.scp") + utt2spk_path = self.working_directory.joinpath("utt2spk.scp") with mfa_open(spk2utt_path, "w") as f: f.write(f"{utterance.speaker.id} {utterance.kaldi_id}\n") with mfa_open(utt2spk_path, "w") as f: @@ -333,7 +327,7 @@ def align_one_utterance(self, utterance: Utterance, session: Session) -> None: args = OnlineAlignmentArguments( 0, self.db_string, - os.path.join(self.working_directory, "align.log"), + self.working_directory.joinpath("align.log"), self.working_directory, sox_string, utterance.to_data(), diff --git a/montreal_forced_aligner/command_line/align.py b/montreal_forced_aligner/command_line/align.py index 49ad45c1..5edc1551 100644 --- a/montreal_forced_aligner/command_line/align.py +++ b/montreal_forced_aligner/command_line/align.py @@ -130,7 +130,7 @@ def align_corpus_cli(context, **kwargs) -> None: mapping = None if custom_mapping_path: with mfa_open(custom_mapping_path, "r") as f: - mapping = yaml.safe_load(f) + mapping = yaml.load(f, Loader=yaml.Loader) aligner.load_reference_alignments(reference_directory) reference_alignments = WorkflowType.reference else: diff --git a/montreal_forced_aligner/command_line/g2p.py b/montreal_forced_aligner/command_line/g2p.py index e55473eb..19855af2 100644 --- 
a/montreal_forced_aligner/command_line/g2p.py +++ b/montreal_forced_aligner/command_line/g2p.py @@ -35,7 +35,7 @@ @click.option( "--config_path", "-c", - help="Path to config file to use for training.", + help="Path to config file to use for G2P.", type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path), ) @click.option( diff --git a/montreal_forced_aligner/command_line/mfa.py b/montreal_forced_aligner/command_line/mfa.py index 754571d0..0e27f66b 100644 --- a/montreal_forced_aligner/command_line/mfa.py +++ b/montreal_forced_aligner/command_line/mfa.py @@ -19,11 +19,13 @@ from montreal_forced_aligner.command_line.g2p import g2p_cli from montreal_forced_aligner.command_line.history import history_cli from montreal_forced_aligner.command_line.model import model_cli +from montreal_forced_aligner.command_line.tokenize import tokenize_cli from montreal_forced_aligner.command_line.train_acoustic_model import train_acoustic_model_cli from montreal_forced_aligner.command_line.train_dictionary import train_dictionary_cli from montreal_forced_aligner.command_line.train_g2p import train_g2p_cli from montreal_forced_aligner.command_line.train_ivector_extractor import train_ivector_cli from montreal_forced_aligner.command_line.train_lm import train_lm_cli +from montreal_forced_aligner.command_line.train_tokenizer import train_tokenizer_cli from montreal_forced_aligner.command_line.transcribe import transcribe_corpus_cli from montreal_forced_aligner.command_line.validate import ( validate_corpus_cli, @@ -141,11 +143,13 @@ def version_cli(): mfa_cli.add_command(g2p_cli) mfa_cli.add_command(model_cli, name="model") mfa_cli.add_command(model_cli, name="models") +mfa_cli.add_command(tokenize_cli) mfa_cli.add_command(train_acoustic_model_cli) mfa_cli.add_command(train_dictionary_cli) mfa_cli.add_command(train_g2p_cli) mfa_cli.add_command(train_ivector_cli) mfa_cli.add_command(train_lm_cli) +mfa_cli.add_command(train_tokenizer_cli) mfa_cli.add_command(transcribe_corpus_cli) mfa_cli.add_command(validate_corpus_cli) mfa_cli.add_command(validate_dictionary_cli) diff --git a/montreal_forced_aligner/command_line/model.py b/montreal_forced_aligner/command_line/model.py index cac4ebe3..5d5686cd 100644 --- a/montreal_forced_aligner/command_line/model.py +++ b/montreal_forced_aligner/command_line/model.py @@ -153,13 +153,13 @@ def inspect_model_cli(model_type: str, model: str) -> None: default=GLOBAL_CONFIG.overwrite, ) @click.help_option("-h", "--help") -def save_model_cli(path: str, model_type: str, name: str, overwrite: bool) -> None: +def save_model_cli(path: Path, model_type: str, name: str, overwrite: bool) -> None: """ Save a model to pretrained folder for later use Parameters ---------- - path: str + path: :class:`~pathlib.Path` Path to model model_type: str Type of model diff --git a/montreal_forced_aligner/command_line/tokenize.py b/montreal_forced_aligner/command_line/tokenize.py new file mode 100644 index 00000000..d46b0c1d --- /dev/null +++ b/montreal_forced_aligner/command_line/tokenize.py @@ -0,0 +1,76 @@ +"""Command line functions for tokenizing utterances""" +from __future__ import annotations + +import os +from pathlib import Path + +import click + +from montreal_forced_aligner.command_line.utils import ( + check_databases, + cleanup_databases, + common_options, + validate_tokenizer_model, +) +from montreal_forced_aligner.config import GLOBAL_CONFIG, MFA_PROFILE_VARIABLE +from montreal_forced_aligner.tokenization.tokenizer import CorpusTokenizer + +__all__ = 
["tokenize_cli"] + + +@click.command( + name="tokenize", + context_settings=dict( + ignore_unknown_options=True, + allow_extra_args=True, + allow_interspersed_args=True, + ), + short_help="Tokenize utterances", +) +@click.argument( + "input_path", type=click.Path(exists=True, file_okay=True, dir_okay=True, path_type=Path) +) +@click.argument("tokenizer_model_path", type=click.UNPROCESSED, callback=validate_tokenizer_model) +@click.argument( + "output_directory", type=click.Path(file_okay=False, dir_okay=True, path_type=Path) +) +@click.option( + "--config_path", + "-c", + help="Path to config file to use for training.", + type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path), +) +@common_options +@click.help_option("-h", "--help") +@click.pass_context +def tokenize_cli(context, **kwargs) -> None: + """ + Tokenize utterances with a trained tokenizer model + """ + if kwargs.get("profile", None) is not None: + os.putenv(MFA_PROFILE_VARIABLE, kwargs["profile"]) + GLOBAL_CONFIG.current_profile.update(kwargs) + GLOBAL_CONFIG.save() + check_databases() + + config_path = kwargs.get("config_path", None) + input_path = kwargs["input_path"] + tokenizer_model_path = kwargs["tokenizer_model_path"] + output_directory = kwargs["output_directory"] + + tokenizer = CorpusTokenizer( + corpus_directory=input_path, + tokenizer_model_path=tokenizer_model_path, + **CorpusTokenizer.parse_parameters(config_path, context.params, context.args), + ) + + try: + tokenizer.setup() + tokenizer.tokenize_utterances() + tokenizer.export_files(output_directory) + except Exception: + tokenizer.dirty = True + raise + finally: + tokenizer.cleanup() + cleanup_databases() diff --git a/montreal_forced_aligner/command_line/train_tokenizer.py b/montreal_forced_aligner/command_line/train_tokenizer.py new file mode 100644 index 00000000..71325e36 --- /dev/null +++ b/montreal_forced_aligner/command_line/train_tokenizer.py @@ -0,0 +1,81 @@ +"""Command line functions for training G2P models""" +from __future__ import annotations + +import os +from pathlib import Path + +import click + +from montreal_forced_aligner.command_line.utils import ( + check_databases, + cleanup_databases, + common_options, +) +from montreal_forced_aligner.config import GLOBAL_CONFIG, MFA_PROFILE_VARIABLE +from montreal_forced_aligner.tokenization.trainer import TokenizerTrainer + +__all__ = ["train_tokenizer_cli"] + + +@click.command( + name="train_tokenizer", + context_settings=dict( + ignore_unknown_options=True, + allow_extra_args=True, + allow_interspersed_args=True, + ), + short_help="Train a tokenizer model", +) +@click.argument( + "corpus_directory", + type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path), +) +@click.argument( + "output_model_path", type=click.Path(file_okay=True, dir_okay=False, path_type=Path) +) +@click.option( + "--config_path", + "-c", + help="Path to config file to use for training.", + type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path), +) +@click.option( + "--evaluate", + "--validate", + "evaluation_mode", + is_flag=True, + help="Perform an analysis of accuracy training on " + "most of the data and validating on an unseen subset.", + default=False, +) +@common_options +@click.help_option("-h", "--help") +@click.pass_context +def train_tokenizer_cli(context, **kwargs) -> None: + """ + Train a tokenizer model from a tokenized corpus. 
+ """ + if kwargs.get("profile", None) is not None: + os.putenv(MFA_PROFILE_VARIABLE, kwargs["profile"]) + GLOBAL_CONFIG.current_profile.update(kwargs) + GLOBAL_CONFIG.save() + check_databases() + config_path = kwargs.get("config_path", None) + corpus_directory = kwargs["corpus_directory"] + output_model_path = kwargs["output_model_path"] + trainer = TokenizerTrainer( + corpus_directory=corpus_directory, + **TokenizerTrainer.parse_parameters(config_path, context.params, context.args), + ) + + try: + trainer.setup() + trainer.train() + trainer.export_model(output_model_path) + + except Exception: + trainer.dirty = True + raise + finally: + trainer.cleanup() + cleanup_databases() diff --git a/montreal_forced_aligner/command_line/utils.py b/montreal_forced_aligner/command_line/utils.py index 0d5bf6f2..3571469c 100644 --- a/montreal_forced_aligner/command_line/utils.py +++ b/montreal_forced_aligner/command_line/utils.py @@ -164,7 +164,7 @@ def validate_model_arg(name: str, model_type: str) -> Path: raise click.BadParameter(str(FileArgumentNotFoundError(name))) if model_type == "dictionary" and name.suffix.lower() == ".yaml": with mfa_open(name, "r") as f: - data = yaml.safe_load(f) + data = yaml.load(f, Loader=yaml.Loader) paths = sorted(set(data.values())) for path in paths: validate_model_arg(path, "dictionary") @@ -202,10 +202,15 @@ def validate_language_model(ctx, param, value): def validate_g2p_model(ctx, param, value): - """Validation callback for G2O model paths""" + """Validation callback for G2P model paths""" return validate_model_arg(value, "g2p") +def validate_tokenizer_model(ctx, param, value): + """Validation callback for tokenizer model paths""" + return validate_model_arg(value, "tokenizer") + + def validate_ivector_extractor(ctx, param, value): """Validation callback for ivector extractor paths""" if value == "speechbrain": diff --git a/montreal_forced_aligner/config.py b/montreal_forced_aligner/config.py index 11449fca..e6cf1ee2 100644 --- a/montreal_forced_aligner/config.py +++ b/montreal_forced_aligner/config.py @@ -98,7 +98,7 @@ def load_command_history() -> List[Dict[str, Any]]: history = [] if path.exists(): with mfa_open(path, "r") as f: - history = yaml.safe_load(f) + history = yaml.load(f, Loader=yaml.Loader) if not history: history = [] for h in history: @@ -125,7 +125,7 @@ def update_command_history(command_data: Dict[str, Any]) -> None: history.append(command_data) history = history[-50:] with mfa_open(path, "w") as f: - yaml.safe_dump(history, f, allow_unicode=True) + yaml.dump(history, f, Dumper=yaml.Dumper, allow_unicode=True) @dataclass(slots=True) diff --git a/montreal_forced_aligner/corpus/acoustic_corpus.py b/montreal_forced_aligner/corpus/acoustic_corpus.py index 1d366f68..72bbf18e 100644 --- a/montreal_forced_aligner/corpus/acoustic_corpus.py +++ b/montreal_forced_aligner/corpus/acoustic_corpus.py @@ -9,6 +9,7 @@ import time import typing from abc import ABCMeta +from pathlib import Path from queue import Empty from typing import List, Optional @@ -115,7 +116,7 @@ def __init__(self, audio_directory: Optional[str] = None, **kwargs): self.transcription_done = False self.alignment_evaluation_done = False - def has_alignments(self, workflow_id: typing.Optional[int] = None): + def has_alignments(self, workflow_id: typing.Optional[int] = None) -> bool: with self.session() as session: if workflow_id is None: check = session.query(PhoneInterval).limit(1).first() is not None @@ -134,7 +135,7 @@ def has_alignments(self, workflow_id: typing.Optional[int] = None): ) 
return check - def has_ivectors(self): + def has_ivectors(self) -> bool: with self.session() as session: check = ( session.query(Corpus) @@ -145,7 +146,7 @@ def has_ivectors(self): ) return check - def has_xvectors(self): + def has_xvectors(self) -> bool: with self.session() as session: check = ( session.query(Corpus) @@ -156,7 +157,7 @@ def has_xvectors(self): ) return check - def has_any_ivectors(self): + def has_any_ivectors(self) -> bool: with self.session() as session: check = ( session.query(Corpus) @@ -208,13 +209,13 @@ def inspect_database(self) -> None: ) session.commit() - def load_reference_alignments(self, reference_directory: str) -> None: + def load_reference_alignments(self, reference_directory: Path) -> None: """ Load reference alignments to use in alignment evaluation from a directory Parameters ---------- - reference_directory: str + reference_directory: :class:`~pathlib.Path` Directory containing reference alignments """ @@ -381,7 +382,7 @@ def generate_final_features(self) -> None: """ logger.info("Generating final features...") time_begin = time.time() - log_directory = os.path.join(self.split_directory, "log") + log_directory = self.split_directory.joinpath("log") os.makedirs(log_directory, exist_ok=True) arguments = self.final_feature_arguments() with tqdm.tqdm(total=self.num_utterances, disable=GLOBAL_CONFIG.quiet) as pbar: @@ -486,7 +487,7 @@ def create_corpus_split(self) -> None: super().create_corpus_split() else: logger.info("Creating corpus split for feature generation...") - os.makedirs(os.path.join(self.split_directory, "log"), exist_ok=True) + os.makedirs(self.split_directory.joinpath("log"), exist_ok=True) with self.session() as session, tqdm.tqdm( total=self.num_utterances + self.num_files, disable=GLOBAL_CONFIG.quiet ) as pbar: @@ -514,7 +515,7 @@ def compute_vad_arguments(self) -> List[VadArguments]: VadArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.split_directory, "log", f"compute_vad.{j.id}.log"), + self.split_directory.joinpath("log", f"compute_vad.{j.id}.log"), j.construct_path(self.split_directory, "feats", "scp"), j.construct_path(self.split_directory, "vad", "scp"), self.vad_options, @@ -550,7 +551,7 @@ def calc_fmllr_arguments(self, iteration: Optional[int] = None) -> List[CalcFmll CalcFmllrArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"{base_log}.{j.id}.log"), + self.working_log_directory.joinpath(f"{base_log}.{j.id}.log"), j.dictionary_ids, feat_strings, j.construct_path_dictionary(self.working_directory, "ali", "ark"), @@ -576,7 +577,7 @@ def mfcc_arguments(self) -> List[MfccArguments]: MfccArguments( j.id, self.db_string, - os.path.join(self.split_directory, "log", f"make_mfcc.{j.id}.log"), + self.split_directory.joinpath("log", f"make_mfcc.{j.id}.log"), self.split_directory, self.mfcc_options, self.pitch_options, @@ -597,7 +598,7 @@ def final_feature_arguments(self) -> List[FinalFeatureArguments]: FinalFeatureArguments( j.id, self.db_string, - os.path.join(self.split_directory, "log", f"generate_final_features.{j.id}.log"), + self.split_directory.joinpath("log", f"generate_final_features.{j.id}.log"), self.split_directory, self.uses_cmvn, self.uses_voiced, @@ -619,7 +620,7 @@ def pitch_range_arguments(self) -> List[PitchRangeArguments]: PitchRangeArguments( j.id, self.db_string, - os.path.join(self.split_directory, "log", f"compute_pitch_range.{j.id}.log"), + self.split_directory.joinpath("log", f"compute_pitch_range.{j.id}.log"), self.split_directory, 
self.pitch_options, ) @@ -628,7 +632,7 @@ def pitch_range_arguments(self) -> List[PitchRangeArguments]: def compute_speaker_pitch_ranges(self): logger.info("Calculating per-speaker f0 ranges...") - log_directory = os.path.join(self.split_directory, "log") + log_directory = self.split_directory.joinpath("log") os.makedirs(log_directory, exist_ok=True) arguments = self.pitch_range_arguments() update_mapping = [] @@ -658,7 +662,7 @@ def mfcc(self) -> None: """ logger.info("Generating MFCCs...") begin = time.time() - log_directory = os.path.join(self.split_directory, "log") + log_directory = self.split_directory.joinpath("log") os.makedirs(log_directory, exist_ok=True) arguments = self.mfcc_arguments() with tqdm.tqdm(total=self.num_utterances, disable=GLOBAL_CONFIG.quiet) as pbar: @@ -676,11 +680,11 @@ def calc_cmvn(self) -> None: Relevant Kaldi binary """ self._write_spk2utt() - spk2utt = os.path.join(self.corpus_output_directory, "spk2utt.scp") - feats = os.path.join(self.corpus_output_directory, "feats.scp") - cmvn_ark = os.path.join(self.corpus_output_directory, "cmvn.ark") - cmvn_scp = os.path.join(self.corpus_output_directory, "cmvn.scp") - log_path = os.path.join(self.features_log_directory, "cmvn.log") + spk2utt = self.corpus_output_directory.joinpath("spk2utt.scp") + feats = self.corpus_output_directory.joinpath("feats.scp") + cmvn_ark = self.corpus_output_directory.joinpath("cmvn.ark") + cmvn_scp = self.corpus_output_directory.joinpath("cmvn.scp") + log_path = self.features_log_directory.joinpath("cmvn.log") with mfa_open(log_path, "w") as logf: subprocess.call( [ @@ -844,7 +848,7 @@ def compute_vad(self) -> None: bulk_update(session, Utterance, utterance_mapping) session.query(Corpus).update({Corpus.vad_calculated: True}) session.commit() - with mfa_open(os.path.join(self.corpus_output_directory, "vad.scp"), "w") as outf: + with mfa_open(self.corpus_output_directory.joinpath("vad.scp"), "w") as outf: for line in sorted(vad_lines, key=lambda x: x.split(maxsplit=1)[0]): outf.write(line) logger.debug(f"VAD computation took {time.time() - begin:.3f} seconds") @@ -858,16 +862,14 @@ def combine_feats(self) -> None: with mfa_open(j.feats_scp_path) as f: for line in f: lines.append(line) - with open( - os.path.join(self.corpus_output_directory, "feats.scp"), "w", encoding="utf8" - ) as f: + with open(self.corpus_output_directory.joinpath("feats.scp"), "w", encoding="utf8") as f: for line in sorted(lines): f.write(line) def _write_feats(self) -> None: """Write feats scp file for Kaldi""" with self.session() as session, open( - os.path.join(self.corpus_output_directory, "feats.scp"), "w", encoding="utf8" + self.corpus_output_directory.joinpath("feats.scp"), "w", encoding="utf8" ) as f: utterances = ( session.query(Utterance.kaldi_id, Utterance.features) @@ -889,7 +891,7 @@ def get_feat_dim(self) -> int: """ job = self.jobs[0] dict_id = None - log_path = os.path.join(self.features_log_directory, "feat-to-dim.log") + log_path = self.features_log_directory.joinpath("feat-to-dim.log") if job.dictionary_ids: dict_id = self.jobs[0].dictionary_ids[0] feature_string = job.construct_feature_proc_string( @@ -901,7 +903,7 @@ def get_feat_dim(self) -> int: self.feature_options["uses_speaker_adaptation"], ) with mfa_open(log_path, "w") as log_file: - subset_ark_path = os.path.join(self.split_directory, "temp.ark") + subset_ark_path = self.split_directory.joinpath("temp.ark") subset_proc = subprocess.Popen( [ thirdparty_binary("subset-feats"), @@ -1230,12 +1232,12 @@ def identifier(self) -> str: return 
self.data_source_identifier @property - def output_directory(self) -> str: + def output_directory(self) -> Path: """Root temporary directory to store corpus and dictionary files""" - return os.path.join(GLOBAL_CONFIG.temporary_directory, self.identifier) + return GLOBAL_CONFIG.current_profile.temporary_directory.joinpath(self.identifier) @property - def working_directory(self) -> str: + def working_directory(self) -> Path: """Working directory to save temporary corpus and dictionary files""" return self.corpus_output_directory @@ -1254,11 +1256,11 @@ def identifier(self) -> str: return self.data_source_identifier @property - def output_directory(self) -> str: + def output_directory(self) -> Path: """Root temporary directory to store corpus and dictionary files""" - return os.path.join(GLOBAL_CONFIG.temporary_directory, self.identifier) + return GLOBAL_CONFIG.current_profile.temporary_directory.joinpath(self.identifier) @property - def working_directory(self) -> str: + def working_directory(self) -> Path: """Working directory to save temporary corpus and dictionary files""" return self.output_directory diff --git a/montreal_forced_aligner/corpus/base.py b/montreal_forced_aligner/corpus/base.py index 5713a7ad..d32e42c7 100644 --- a/montreal_forced_aligner/corpus/base.py +++ b/montreal_forced_aligner/corpus/base.py @@ -6,6 +6,7 @@ import time import typing from abc import ABCMeta, abstractmethod +from pathlib import Path import sqlalchemy.engine import tqdm @@ -153,7 +154,7 @@ def inspect_database(self) -> None: session.add( Corpus( name=self.data_source_identifier, - path=str(self.corpus_directory), + path=self.corpus_directory, data_directory=self.corpus_output_directory, ) ) @@ -235,8 +236,10 @@ def get_file( :class:`~montreal_forced_aligner.db.File` File match """ + close = False if session is None: session = self.session() + close = True file = session.query(File).options( selectinload(File.utterances).joinedload(Utterance.speaker, innerjoin=True), joinedload(File.sound_file, innerjoin=True), @@ -247,11 +250,15 @@ def get_file( file = file.get(id) if not file: raise Exception(f"Could not find utterance with id of {id}") + if close: + session.close() return file else: file = file.filter(File.name == name).first() if not file: raise Exception(f"Could not find utterance with name of {name}") + if close: + session.close() return file @property @@ -260,15 +267,15 @@ def corpus_meta(self) -> typing.Dict[str, typing.Any]: return {} @property - def features_log_directory(self) -> str: + def features_log_directory(self) -> Path: """Feature log directory""" - return os.path.join(self.split_directory, "log") + return self.split_directory.joinpath("log") @property - def split_directory(self) -> str: + def split_directory(self) -> Path: """Directory used to store information split by job""" - return os.path.join( - self.corpus_output_directory, f"split{GLOBAL_CONFIG.current_profile.num_jobs}" + return self.corpus_output_directory.joinpath( + f"split{GLOBAL_CONFIG.current_profile.num_jobs}" ) def _write_spk2utt(self) -> None: @@ -289,12 +296,12 @@ def _write_spk2utt(self) -> None: data[speaker_id].append(utt_id) utt2spk_data[utt_id] = speaker_id - output_mapping(utt2spk_data, os.path.join(self.corpus_output_directory, "utt2spk.scp")) - output_mapping(data, os.path.join(self.corpus_output_directory, "spk2utt.scp")) + output_mapping(utt2spk_data, self.corpus_output_directory.joinpath("utt2spk.scp")) + output_mapping(data, self.corpus_output_directory.joinpath("spk2utt.scp")) def 
create_corpus_split(self) -> None: """Create split directory and output information from Jobs""" - os.makedirs(os.path.join(self.split_directory, "log"), exist_ok=True) + os.makedirs(self.split_directory.joinpath("log"), exist_ok=True) with self.session() as session, tqdm.tqdm( total=self.num_utterances, disable=GLOBAL_CONFIG.quiet ) as pbar: @@ -644,7 +651,7 @@ def normalize_text(self) -> None: if args is None: return logger.info("Normalizing text...") - log_directory = os.path.join(self.split_directory, "log") + log_directory = self.split_directory.joinpath("log") word_update_mappings = {} word_insert_mappings = {} pronunciation_insert_mappings = [] @@ -1026,8 +1033,8 @@ def create_subset(self, subset: int) -> None: Number of utterances to include in subset """ logger.info(f"Creating subset directory with {subset} utterances...") - subset_directory = os.path.join(self.corpus_output_directory, f"subset_{subset}") - log_dir = os.path.join(subset_directory, "log") + subset_directory = self.corpus_output_directory.joinpath(f"subset_{subset}") + log_dir = subset_directory.joinpath("log") os.makedirs(log_dir, exist_ok=True) num_dictionaries = getattr(self, "num_dictionaries", 1) with self.session() as session: @@ -1226,7 +1233,7 @@ def subset_directory(self, subset: typing.Optional[int]) -> str: session.commit() if subset is None or subset >= self.num_utterances or subset <= 0: return self.split_directory - directory = os.path.join(self.corpus_output_directory, f"subset_{subset}") + directory = self.corpus_output_directory.joinpath(f"subset_{subset}") if not os.path.exists(directory): self.create_subset(subset) return directory diff --git a/montreal_forced_aligner/corpus/features.py b/montreal_forced_aligner/corpus/features.py index 0a63b877..48ea68a6 100644 --- a/montreal_forced_aligner/corpus/features.py +++ b/montreal_forced_aligner/corpus/features.py @@ -9,6 +9,7 @@ import subprocess import typing from abc import abstractmethod +from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Union import dataclassy @@ -59,8 +60,8 @@ class VadArguments(MfaArguments): """Arguments for :class:`~montreal_forced_aligner.corpus.features.ComputeVadFunction`""" - feats_scp_path: str - vad_scp_path: str + feats_scp_path: Path + vad_scp_path: Path vad_options: MetaDict @@ -71,7 +72,7 @@ class MfccArguments(MfaArguments): Arguments for :class:`~montreal_forced_aligner.corpus.features.MfccFunction` """ - data_directory: str + data_directory: Path mfcc_options: MetaDict pitch_options: MetaDict @@ -83,7 +84,7 @@ class FinalFeatureArguments(MfaArguments): Arguments for :class:`~montreal_forced_aligner.corpus.features.FinalFeatureFunction` """ - data_directory: str + data_directory: Path uses_cmvn: bool voiced_only: bool subsample_feats: int @@ -96,7 +97,7 @@ class PitchArguments(MfaArguments): Arguments for :class:`~montreal_forced_aligner.corpus.features.MfccFunction` """ - data_directory: str + data_directory: Path pitch_options: MetaDict @@ -107,7 +108,7 @@ class PitchRangeArguments(MfaArguments): Arguments for :class:`~montreal_forced_aligner.corpus.features.MfccFunction` """ - data_directory: str + data_directory: Path pitch_options: MetaDict @@ -118,11 +119,11 @@ class CalcFmllrArguments(MfaArguments): dictionaries: List[str] feature_strings: Dict[str, str] - ali_paths: Dict[str, str] - ali_model_path: str - model_path: str - spk2utt_paths: Dict[str, str] - trans_paths: Dict[str, str] + ali_paths: Dict[str, Path] + ali_model_path: Path + model_path: Path + spk2utt_paths: Dict[str, Path] 
+ trans_paths: Dict[str, Path] fmllr_options: MetaDict @@ -132,9 +133,9 @@ class ExtractIvectorsArguments(MfaArguments): """Arguments for :class:`~montreal_forced_aligner.corpus.features.ExtractIvectorsFunction`""" ivector_options: MetaDict - ie_path: str - ivectors_scp_path: str - dubm_path: str + ie_path: Path + ivectors_scp_path: Path + dubm_path: Path # noinspection PyUnresolvedReferences @@ -166,7 +167,7 @@ def feature_make_safe(value: Any) -> str: def compute_mfcc_process( log_file: io.FileIO, - wav_path: str, + wav_path: Path, segments: typing.Union[str, subprocess.Popen, subprocess.PIPE], mfcc_options: MetaDict, min_length=0.1, @@ -250,7 +251,7 @@ def compute_mfcc_process( def compute_pitch_process( log_file: io.FileIO, - wav_path: str, + wav_path: Path, segments: typing.Union[str, subprocess.Popen, subprocess.PIPE], pitch_options: MetaDict, min_length=0.1, @@ -361,10 +362,10 @@ def compute_pitch_process( def compute_transform_process( log_file: io.FileIO, - feat_proc: typing.Union[subprocess.Popen, str], - utt2spk_path: str, - lda_mat_path: typing.Optional[str], - fmllr_path: typing.Optional[str], + feat_proc: typing.Union[subprocess.Popen, Path], + utt2spk_path: Path, + lda_mat_path: typing.Optional[Path], + fmllr_path: typing.Optional[Path], lda_options: MetaDict, ) -> subprocess.Popen: """ @@ -376,13 +377,11 @@ def compute_transform_process( File for logging stderr feat_proc: subprocess.Popen Feature generation process - utt2spk_path: str + utt2spk_path: :class:`~pathlib.Path` Utterance to speaker SCP file path - cmvn_path: str - CMVN SCP file path - lda_mat_path: str + lda_mat_path: :class:`~pathlib.Path` LDA matrix file path - fmllr_path: str + fmllr_path: :class:`~pathlib.Path` fMLLR transform file path lda_options: dict[str, Any] Options for LDA @@ -888,7 +887,7 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int, int]]: with mfa_open(self.log_path, "w") as log_file: feats_scp_path = self.feats_scp_path vad_scp_path = self.vad_scp_path - vad_ark_path = self.vad_scp_path.replace(".scp", ".ark") + vad_ark_path = self.vad_scp_path.with_suffix(".ark") vad_proc = subprocess.Popen( [ thirdparty_binary("compute-vad"), @@ -966,7 +965,7 @@ def _run(self) -> typing.Generator[str]: spk2utt_path = self.spk2utt_paths[dict_id] trans_path = self.trans_paths[dict_id] initial = True - if os.path.exists(trans_path): + if trans_path.exists(): initial = False post_proc = subprocess.Popen( [thirdparty_binary("ali-to-post"), f"ark,s,cs:{ali_path}", "ark:-"], @@ -990,7 +989,7 @@ def _run(self) -> typing.Generator[str]: env=os.environ, ) - temp_trans_path = trans_path + ".tmp" + temp_trans_path = trans_path.with_suffix(trans_path.suffix + ".tmp") if self.ali_model_path != self.model_path: post_gpost_proc = subprocess.Popen( [ @@ -1025,7 +1024,7 @@ def _run(self) -> typing.Generator[str]: else: if not initial: - temp_composed_trans_path = trans_path + ".cmp.tmp" + temp_composed_trans_path = trans_path.with_suffix(trans_path.suffix + ".cmp.tmp") est_proc = subprocess.Popen( [ thirdparty_binary("gmm-est-fmllr"), @@ -1247,7 +1246,7 @@ def model_path(self) -> str: # needed for fmllr @property @abstractmethod - def working_directory(self) -> str: + def working_directory(self) -> Path: """Abstract method for working directory""" ... 
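Note: two different suffix idioms appear in the fMLLR and VAD hunks above. Swapping an extension maps directly onto with_suffix, while appending a temporary extension has to re-attach the existing suffix first, since with_suffix alone would drop it. A minimal sketch with stand-in file names:

    from pathlib import Path

    vad_scp_path = Path("vad.1.scp")
    trans_path = Path("trans.1.ark")

    # Swap the final suffix: ".scp" -> ".ark".
    assert vad_scp_path.with_suffix(".ark").name == "vad.1.ark"

    # Append ".tmp" while keeping ".ark" (mirrors the old trans_path + ".tmp").
    temp_trans_path = trans_path.with_suffix(trans_path.suffix + ".tmp")
    assert temp_trans_path.name == "trans.1.ark.tmp"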
@@ -1522,7 +1521,7 @@ def _run(self) -> typing.Generator[str]: stderr=log_file, env=os.environ, ) - ivector_ark_path = self.ivectors_scp_path.replace(".scp", ".ark") + ivector_ark_path = self.ivectors_scp_path.with_suffix(".ark") extract_proc = subprocess.Popen( [ thirdparty_binary("ivector-extract"), @@ -1886,13 +1885,13 @@ class PldaModel: transformed_diagonalizing_transform: typing.Optional[np.ndarray] = None @classmethod - def load(cls, plda_path): + def load(cls, plda_path: Path): """ Instantiate a PLDA model from a trained model file Parameters ---------- - plda_path: str + plda_path: :class:`~pathlib.Path` Path to trained PLDA model Returns diff --git a/montreal_forced_aligner/corpus/ivector_corpus.py b/montreal_forced_aligner/corpus/ivector_corpus.py index b738ea63..125bed20 100644 --- a/montreal_forced_aligner/corpus/ivector_corpus.py +++ b/montreal_forced_aligner/corpus/ivector_corpus.py @@ -49,12 +49,12 @@ def __init__(self, **kwargs): @property def ie_path(self) -> str: """Ivector extractor ie path""" - return os.path.join(self.working_directory, "final.ie") + return self.working_directory.joinpath("final.ie") @property def dubm_path(self) -> str: """DUBM model path""" - return os.path.join(self.working_directory, "final.dubm") + return self.working_directory.joinpath("final.dubm") def extract_ivectors_arguments(self) -> List[ExtractIvectorsArguments]: """ @@ -71,7 +71,7 @@ def extract_ivectors_arguments(self) -> List[ExtractIvectorsArguments]: ExtractIvectorsArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"extract_ivectors.{j.id}.log"), + self.working_log_directory.joinpath(f"extract_ivectors.{j.id}.log"), self.ivector_options, self.ie_path, j.construct_path(self.split_directory, "ivectors", "scp"), @@ -89,19 +89,19 @@ def utterance_ivector_path(self) -> str: @property def adapted_plda_path(self) -> str: """Path to adapted PLDA model""" - return os.path.join(self.working_directory, "plda_adapted") + return self.working_directory.joinpath("plda_adapted") @property def plda_path(self) -> str: """Path to trained PLDA model""" - return os.path.join(self.working_directory, "plda") + return self.working_directory.joinpath("plda") def adapt_plda(self) -> None: """Adapted a trained PLDA model with new ivectors""" if not os.path.exists(self.utterance_ivector_path): self.extract_ivectors() - log_path = os.path.join(self.working_log_directory, "adapt_plda.log") + log_path = self.working_log_directory.joinpath("adapt_plda.log") with mfa_open(log_path, "w") as log_file: proc = subprocess.Popen( [ @@ -124,8 +124,8 @@ def compute_speaker_ivectors(self) -> None: self._write_spk2utt() spk2utt_path = os.path.join(self.corpus_output_directory, "spk2utt.scp") - log_path = os.path.join(self.working_log_directory, "speaker_ivectors.log") - num_utts_path = os.path.join(self.working_directory, "current_num_utts.ark") + log_path = self.working_log_directory.joinpath("speaker_ivectors.log") + num_utts_path = self.working_directory.joinpath("current_num_utts.ark") logger.info("Computing speaker ivectors...") self.stopped.reset() if self.stopped.stop_check(): @@ -180,8 +180,8 @@ def compute_plda(self) -> None: self._write_spk2utt() spk2utt_path = os.path.join(self.corpus_output_directory, "spk2utt.scp") - plda_path = os.path.join(self.working_directory, "plda") - log_path = os.path.join(self.working_log_directory, "plda.log") + plda_path = self.working_directory.joinpath("plda") + log_path = self.working_log_directory.joinpath("plda.log") 
logger.info("Computing PLDA...") self.stopped.reset() if self.stopped.stop_check(): @@ -269,7 +269,7 @@ def extract_ivectors(self) -> None: logger.debug(f"Ivector extraction took {time.time() - begin:.3f} seconds") def transform_ivectors(self): - plda_transform_path = os.path.join(self.working_directory, "plda.pkl") + plda_transform_path = self.working_directory.joinpath("plda.pkl") if os.path.exists(plda_transform_path): with open(plda_transform_path, "rb") as f: self.plda = pickle.load(f) @@ -379,7 +379,7 @@ def collect_speaker_ivectors(self) -> None: speaker_ivector_ark_path = os.path.join( self.working_directory, "current_speaker_ivectors.ark" ) - num_utts_path = os.path.join(self.working_directory, "current_num_utts.ark") + num_utts_path = self.working_directory.joinpath("current_num_utts.ark") if not os.path.exists(speaker_ivector_ark_path): self.compute_speaker_ivectors() with self.session() as session, tqdm.tqdm( diff --git a/montreal_forced_aligner/corpus/multiprocessing.py b/montreal_forced_aligner/corpus/multiprocessing.py index c375f110..22bf3a24 100644 --- a/montreal_forced_aligner/corpus/multiprocessing.py +++ b/montreal_forced_aligner/corpus/multiprocessing.py @@ -8,6 +8,7 @@ import os import re import typing +from pathlib import Path from queue import Empty, Queue import sqlalchemy @@ -249,14 +250,6 @@ class NormalizeTextArguments(MfaArguments): """ Arguments for :class:`~montreal_forced_aligner.corpus.multiprocessing.NormalizeTextFunction` - Parameters - ---------- - model_path: str - Path to model file - phone_pdf_counts_path: str - Path to output PDF counts - feature_strings: dict[int, str] - Mapping of dictionaries to feature generation strings """ word_break_markers: typing.List[str] @@ -281,7 +274,7 @@ class ExportKaldiFilesArguments(MfaArguments): """ - split_directory: str + split_directory: Path for_features: bool @@ -495,12 +488,24 @@ def _no_dictionary_sanitize(self, session): .filter(Utterance.job_id == self.job_name) ) for u_id, u_text in utterances: - text = " ".join(sanitize_function(u_text)) - oovs = set() + text = [] + character_text = [] + for w in sanitize_function(u_text): + text.append(w) + if character_text: + character_text.append("") + if self.bracket_regex.match(w): + character_text.append(self.bracketed_word) + else: + for g in w: + character_text.append(g) + text = " ".join(text) + character_text = " ".join(character_text) yield { "id": u_id, - "oovs": " ".join(sorted(oovs)), + "oovs": "", "normalized_text": text, + "normalized_character_text": character_text, }, None def _run(self) -> typing.Generator[typing.Tuple[int, float]]: diff --git a/montreal_forced_aligner/corpus/text_corpus.py b/montreal_forced_aligner/corpus/text_corpus.py index 8e630bba..e51c91f1 100644 --- a/montreal_forced_aligner/corpus/text_corpus.py +++ b/montreal_forced_aligner/corpus/text_corpus.py @@ -6,6 +6,7 @@ import os import sys import time +from pathlib import Path from queue import Empty import tqdm @@ -149,7 +150,6 @@ def _load_corpus_from_source_mp(self) -> None: logger.info("Detected ctrl-c, please wait a moment while we clean everything up...") self.stopped.stop() finished_adding.stop() - job_queue.join() self.stopped.set_sigint_source() while True: try: @@ -306,12 +306,12 @@ def identifier(self) -> str: return self.data_source_identifier @property - def output_directory(self) -> str: + def output_directory(self) -> Path: """Root temporary directory to store all corpus and dictionary files""" - return os.path.join(GLOBAL_CONFIG.temporary_directory, self.identifier) + 
return GLOBAL_CONFIG.current_profile.temporary_directory.joinpath(self.identifier) @property - def working_directory(self) -> str: + def working_directory(self) -> Path: """Working directory""" return self.corpus_output_directory @@ -341,11 +341,11 @@ def identifier(self) -> str: return self.data_source_identifier @property - def output_directory(self) -> str: + def output_directory(self) -> Path: """Root temporary directory to store all corpus and dictionary files""" - return os.path.join(GLOBAL_CONFIG.temporary_directory, self.identifier) + return GLOBAL_CONFIG.current_profile.temporary_directory.joinpath(self.identifier) @property - def working_directory(self) -> str: + def working_directory(self) -> Path: """Working directory""" return self.corpus_output_directory diff --git a/montreal_forced_aligner/data.py b/montreal_forced_aligner/data.py index 7ffa305a..e3a39a02 100644 --- a/montreal_forced_aligner/data.py +++ b/montreal_forced_aligner/data.py @@ -12,6 +12,7 @@ import math import re import typing +from pathlib import Path import dataclassy import pynini @@ -95,13 +96,13 @@ class MfaArguments: Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run """ job_name: int db_string: str - log_path: str + log_path: Path class TextFileType(enum.Enum): @@ -228,6 +229,7 @@ class WorkflowType(enum.Enum): train_g2p = 10 g2p = 11 language_model_training = 12 + tokenizer_training = 13 class WordType(enum.Enum): diff --git a/montreal_forced_aligner/db.py b/montreal_forced_aligner/db.py index 62b8e34c..14cd87d9 100644 --- a/montreal_forced_aligner/db.py +++ b/montreal_forced_aligner/db.py @@ -222,19 +222,19 @@ class Corpus(MfaSqlBase): num_jobs = Column(Integer, default=0) current_subset = Column(Integer, default=0) - data_directory = Column(String, nullable=False) + data_directory = Column(PathType, nullable=False) jobs = relationship("Job", back_populates="corpus") @property def split_directory(self): - return os.path.join(self.data_directory, f"split{self.num_jobs}") + return self.data_directory.joinpath(f"split{self.num_jobs}") @property def current_subset_directory(self): if not self.current_subset: return self.split_directory - return os.path.join(self.data_directory, f"subset_{self.current_subset}") + return self.data_directory.joinpath(f"subset_{self.current_subset}") @property def speaker_ivector_column(self): @@ -286,7 +286,7 @@ class Dictionary(MfaSqlBase): Dictionary name dialect: str Dialect of dictionary if dictionary name is in MFA format - path: str + path: :class:`~pathlib.Path` Path to the dictionary phone_set_type: :class:`~montreal_forced_aligner.data.PhoneSetType` Phone set @@ -388,85 +388,85 @@ def clitic_set(self) -> typing.Set[str]: return {x.word for x in self.words if x.word_type is WordType.clitic} @property - def word_boundary_int_path(self) -> str: + def word_boundary_int_path(self) -> Path: """Path to the word boundary integer IDs""" - return os.path.join(self.phones_directory, "word_boundary.int") + return self.phones_directory.joinpath("word_boundary.int") @property - def disambiguation_symbols_int_path(self) -> str: + def disambiguation_symbols_int_path(self) -> Path: """Path to the disambiguation symbols integer IDs""" - return os.path.join(self.phones_directory, "disambiguation_symbols.int") + return self.phones_directory.joinpath("disambiguation_symbols.int") @property - def phones_directory(self) -> str: + def phones_directory(self) -> Path: """ Phones 
directory """ - return os.path.join(str(self.root_temp_directory), "phones") + return self.root_temp_directory.joinpath("phones") @property - def phone_symbol_table_path(self) -> str: + def phone_symbol_table_path(self) -> Path: """Path to file containing phone symbols and their integer IDs""" - return os.path.join(self.phones_directory, "phones.txt") + return self.phones_directory.joinpath("phones.txt") @property - def grapheme_symbol_table_path(self) -> str: + def grapheme_symbol_table_path(self) -> Path: """Path to file containing grapheme symbols and their integer IDs""" - return os.path.join(self.phones_directory, "graphemes.txt") + return self.phones_directory.joinpath("graphemes.txt") @property - def phone_disambig_path(self) -> str: + def phone_disambig_path(self) -> Path: """Path to file containing phone symbols and their integer IDs""" - return os.path.join(self.phones_directory, "phone_disambig.txt") + return self.phones_directory.joinpath("phone_disambig.txt") @property - def temp_directory(self) -> str: + def temp_directory(self) -> Path: """ Path of disambiguated lexicon fst (L.fst) """ - return os.path.join(str(self.root_temp_directory), f"{self.id}_{self.name}") + return self.root_temp_directory.joinpath(f"{self.id}_{self.name}") @property - def lexicon_disambig_fst_path(self) -> str: + def lexicon_disambig_fst_path(self) -> Path: """ Path of disambiguated lexicon fst (L.fst) """ - return os.path.join(self.temp_directory, "L.disambig_fst") + return self.temp_directory.joinpath("L.disambig_fst") @property - def align_lexicon_path(self) -> str: + def align_lexicon_path(self) -> Path: """ Path of lexicon file to use for aligning lattices """ - return os.path.join(self.temp_directory, "align_lexicon.fst") + return self.temp_directory.joinpath("align_lexicon.fst") @property - def align_lexicon_disambig_path(self) -> str: + def align_lexicon_disambig_path(self) -> Path: """ Path of lexicon file to use for aligning lattices """ - return os.path.join(self.temp_directory, "align_lexicon.disambig_fst") + return self.temp_directory.joinpath("align_lexicon.disambig_fst") @property - def align_lexicon_int_path(self) -> str: + def align_lexicon_int_path(self) -> Path: """ Path of lexicon file to use for aligning lattices """ - return os.path.join(self.temp_directory, "align_lexicon.int") + return self.temp_directory.joinpath("align_lexicon.int") @property - def lexicon_fst_path(self) -> str: + def lexicon_fst_path(self) -> Path: """ Path of disambiguated lexicon fst (L.fst) """ - return os.path.join(self.temp_directory, "L.fst") + return self.temp_directory.joinpath("L.fst") @property - def words_symbol_path(self) -> str: + def words_symbol_path(self) -> Path: """ Path of word to int mapping file for the dictionary """ - return os.path.join(self.temp_directory, "words.txt") + return self.temp_directory.joinpath("words.txt") @property def data_source_identifier(self) -> str: @@ -909,7 +909,7 @@ class File(MfaSqlBase): Primary key name: str Base name of the file - relative_path: str + relative_path: :class:`~pathlib.Path` Path of the file relative to the root corpus directory modified: bool Flag for whether the file has been changed in the database for exporting @@ -1120,7 +1120,7 @@ class SoundFile(MfaSqlBase): Foreign key to :class:`~montreal_forced_aligner.db.File` file: :class:`~montreal_forced_aligner.db.File` Root file - sound_file_path: str + sound_file_path: :class:`~pathlib.Path` Path to the audio file format: str Format of the audio file (flac, wav, mp3, etc) @@ -1192,7 +1192,7 
@@ class TextFile(MfaSqlBase): Foreign key to :class:`~montreal_forced_aligner.db.File` file: :class:`~montreal_forced_aligner.db.File` Root file - text_file_path: str + text_file_path: :class:`~pathlib.Path` Path to the transcription file file_type: str Type of the transcription file (lab, TextGrid, etc) @@ -1549,7 +1549,7 @@ class CorpusWorkflow(MfaSqlBase): id = Column(Integer, primary_key=True, autoincrement=True) name = Column(String, unique=True, index=True) workflow_type = Column(Enum(WorkflowType), nullable=False, index=True) - working_directory = Column(String, nullable=False) + working_directory = Column(PathType, nullable=False) time_stamp = Column(DateTime, nullable=False, server_default=sqlalchemy.func.now(), index=True) current = Column(Boolean, nullable=False, default=False, index=True) done = Column(Boolean, nullable=False, default=False, index=True) @@ -1575,7 +1575,7 @@ class CorpusWorkflow(MfaSqlBase): @property def lda_mat_path(self) -> str: - return os.path.join(self.working_directory, "lda.mat") + return self.working_directory.joinpath("lda.mat") class PhoneInterval(MfaSqlBase): @@ -1852,23 +1852,23 @@ def dictionary_ids(self) -> typing.List[int]: return [x.id for x in self.dictionaries] @property - def wav_scp_path(self): + def wav_scp_path(self) -> Path: return self.construct_path(self.corpus.split_directory, "wav", "scp") @property - def segments_scp_path(self): + def segments_scp_path(self) -> Path: return self.construct_path(self.corpus.split_directory, "segments", "scp") @property - def feats_scp_path(self): + def feats_scp_path(self) -> Path: return self.construct_path(self.corpus.split_directory, "feats", "scp") @property - def feats_ark_path(self): + def feats_ark_path(self) -> Path: return self.construct_path(self.corpus.split_directory, "feats", "ark") @property - def per_dictionary_feats_scp_paths(self): + def per_dictionary_feats_scp_paths(self) -> typing.Dict[int, Path]: paths = {} for d in self.dictionaries: paths[d.id] = self.construct_path( @@ -1877,7 +1877,7 @@ def per_dictionary_feats_scp_paths(self): return paths @property - def per_dictionary_utt2spk_scp_paths(self): + def per_dictionary_utt2spk_scp_paths(self) -> typing.Dict[int, Path]: paths = {} for d in self.dictionaries: paths[d.id] = self.construct_path( @@ -1886,7 +1886,7 @@ def per_dictionary_utt2spk_scp_paths(self): return paths @property - def per_dictionary_spk2utt_scp_paths(self): + def per_dictionary_spk2utt_scp_paths(self) -> typing.Dict[int, Path]: paths = {} for d in self.dictionaries: paths[d.id] = self.construct_path( @@ -1895,7 +1895,7 @@ def per_dictionary_spk2utt_scp_paths(self): return paths @property - def per_dictionary_cmvn_scp_paths(self): + def per_dictionary_cmvn_scp_paths(self) -> typing.Dict[int, Path]: paths = {} for d in self.dictionaries: paths[d.id] = self.construct_path( @@ -1904,7 +1904,7 @@ def per_dictionary_cmvn_scp_paths(self): return paths @property - def per_dictionary_text_int_scp_paths(self): + def per_dictionary_text_int_scp_paths(self) -> typing.Dict[int, Path]: paths = {} for d in self.dictionaries: paths[d.id] = self.construct_path( @@ -1913,8 +1913,8 @@ def per_dictionary_text_int_scp_paths(self): return paths def construct_path( - self, directory: str, identifier: str, extension: str, dictionary_id: int = None - ) -> str: + self, directory: Path, identifier: str, extension: str, dictionary_id: int = None + ) -> Path: """ Helper function for constructing dictionary-dependent paths for the Job @@ -1929,28 +1929,28 @@ def construct_path( Returns 
------- - str + Path Path """ if dictionary_id is None: - return os.path.join(directory, f"{identifier}.{self.id}.{extension}") - return os.path.join(directory, f"{identifier}.{dictionary_id}.{self.id}.{extension}") + return directory.joinpath(f"{identifier}.{self.id}.{extension}") + return directory.joinpath(f"{identifier}.{dictionary_id}.{self.id}.{extension}") - def construct_path_dictionary(self, directory: str, identifier: str, extension: str): + def construct_path_dictionary(self, directory: Path, identifier: str, extension: str): paths = {} for d_id in self.dictionary_ids: paths[d_id] = self.construct_path(directory, identifier, extension, d_id) return paths def construct_dictionary_dependent_paths( - self, directory: str, identifier: str, extension: str - ) -> typing.Dict[int, str]: + self, directory: Path, identifier: str, extension: str + ) -> typing.Dict[int, Path]: """ Helper function for constructing paths that depend only on the dictionaries of the job, and not the job name itself. These paths should be merged with all other jobs to get a full set of dictionary paths. Parameters ---------- - directory: str + directory: :class:`~pathlib.Path` Directory to use as the root identifier: str Identifier for the path name, like ali or acc @@ -1958,12 +1958,12 @@ def construct_dictionary_dependent_paths( Extension of the path, like .scp or .ark Returns ------- - dict[int, str] + dict[int, Path] Path for each dictionary """ output = {} for dict_id in self.dictionary_ids: - output[dict_id] = os.path.join(directory, f"{identifier}.{dict_id}.{extension}") + output[dict_id] = directory.joinpath(f"{identifier}.{dict_id}.{extension}") return output def construct_online_feature_proc_string(self): diff --git a/montreal_forced_aligner/diarization/multiprocessing.py b/montreal_forced_aligner/diarization/multiprocessing.py index 184d9b20..26660d3e 100644 --- a/montreal_forced_aligner/diarization/multiprocessing.py +++ b/montreal_forced_aligner/diarization/multiprocessing.py @@ -9,6 +9,7 @@ import sys import time import typing +from pathlib import Path import dataclassy import hdbscan @@ -74,8 +75,8 @@ class PldaClassificationArguments(MfaArguments): """Arguments for :class:`~montreal_forced_aligner.diarization.multiprocessing.PldaClassificationFunction`""" plda: PldaModel - train_ivector_path: str - num_utts_path: str + train_ivector_path: Path + num_utts_path: Path use_xvector: bool diff --git a/montreal_forced_aligner/diarization/speaker_diarizer.py b/montreal_forced_aligner/diarization/speaker_diarizer.py index 3680c38f..4003a9da 100644 --- a/montreal_forced_aligner/diarization/speaker_diarizer.py +++ b/montreal_forced_aligner/diarization/speaker_diarizer.py @@ -15,6 +15,7 @@ import sys import time import typing +from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Optional import numpy as np @@ -125,7 +126,7 @@ class SpeakerDiarizer(IvectorCorpusMixin, TopLevelMfaWorker, FileExporterMixin): def __init__( self, - ivector_extractor_path: str = "speechbrain", + ivector_extractor_path: typing.Union[str, Path] = "speechbrain", expected_num_speakers: int = 0, cluster: bool = True, evaluation_mode: bool = False, @@ -185,7 +186,7 @@ def __init__( @classmethod def parse_parameters( cls, - config_path: Optional[str] = None, + config_path: Optional[Path] = None, args: Optional[Dict[str, Any]] = None, unknown_args: Optional[List[str]] = None, ) -> MetaDict: @@ -194,7 +195,7 @@ def parse_parameters( Parameters ---------- - config_path: str + config_path: :class:`~pathlib.Path` 
Config path
         args: dict[str, Any]
             Parsed arguments
@@ -240,7 +241,7 @@ def setup(self) -> None:
             if wf.done:
                 logger.info("Diarization already done, skipping initialization.")
                 return
-        log_dir = os.path.join(self.working_directory, "log")
+        log_dir = self.working_directory.joinpath("log")
         os.makedirs(log_dir, exist_ok=True)
         try:
             if self.ivector_extractor is None:  # Download models if needed
@@ -308,7 +309,7 @@ def plda_classification_arguments(self) -> List[PldaClassificationArguments]:
             PldaClassificationArguments(
                 j.id,
                 getattr(self, "db_string", ""),
-                os.path.join(self.working_log_directory, f"plda_classification.{j.id}.log"),
+                self.working_log_directory.joinpath(f"plda_classification.{j.id}.log"),
                 self.plda,
                 self.speaker_ivector_path,
                 self.num_utts_path,
@@ -325,7 +326,7 @@ def classify_speakers(self):
         with self.session() as session, tqdm.tqdm(
             total=self.num_utterances, disable=GLOBAL_CONFIG.quiet
         ) as pbar, mfa_open(
-            os.path.join(self.working_directory, "speaker_classification_results.csv"), "w"
+            self.working_directory.joinpath("speaker_classification_results.csv"), "w"
         ) as f:
             writer = csv.DictWriter(f, ["utt_id", "file", "begin", "end", "speaker", "score"])
@@ -359,7 +360,7 @@ def classify_speakers(self):
                 ]
                 func = SpeechbrainClassificationFunction
             else:
-                plda_transform_path = os.path.join(self.working_directory, "plda.pkl")
+                plda_transform_path = self.working_directory.joinpath("plda.pkl")
                 with open(plda_transform_path, "rb") as f:
                     self.plda: PldaModel = pickle.load(f)
                 arguments = self.plda_classification_arguments()
@@ -440,7 +441,7 @@ def evaluate_clustering(self) -> None:
         """Compute clustering metric scores and output clustering evaluation results"""
         label_to_ground_truth_mapping = self.map_speakers_to_ground_truth()
         with self.session() as session, mfa_open(
-            os.path.join(self.working_directory, "diarization_evaluation_results.csv"), "w"
+            self.working_directory.joinpath("diarization_evaluation_results.csv"), "w"
         ) as f:
             writer = csv.DictWriter(
@@ -503,7 +504,7 @@ def evaluate_classification(self) -> None:
         """Evaluate and output classification accuracy"""
         label_to_ground_truth_mapping = self.map_speakers_to_ground_truth()
         with self.session() as session, mfa_open(
-            os.path.join(self.working_directory, "diarization_evaluation_results.csv"), "w"
+            self.working_directory.joinpath("diarization_evaluation_results.csv"), "w"
         ) as f:
             writer = csv.DictWriter(
                 f,
@@ -563,12 +564,12 @@ def evaluate_classification(self) -> None:
     @property
-    def num_utts_path(self) -> str:
-        """Path to archive containing number of per training speaker"""
-        return os.path.join(self.working_directory, "num_utts.ark")
+    def num_utts_path(self) -> Path:
+        """Path to archive containing the number of utterances per training speaker"""
+        return self.working_directory.joinpath("num_utts.ark")

     @property
-    def speaker_ivector_path(self) -> str:
+    def speaker_ivector_path(self) -> Path:
         """Path to archive containing training speaker ivectors"""
-        return os.path.join(self.working_directory, "speaker_ivectors.ark")
+        return self.working_directory.joinpath("speaker_ivectors.ark")

     def visualize_clusters(self, ivectors, cluster_labels=None):
         import seaborn as sns
@@ -621,7 +622,7 @@ def visualize_clusters(self, ivectors, cluster_labels=None):
             shadow=True,
             ncol=5,
         )
-        plot_path = os.path.join(self.working_directory, "cluster_plot.png")
+        plot_path = self.working_directory.joinpath("cluster_plot.png")
         plt.savefig(plot_path, bbox_extra_artists=(lgd,), bbox_inches="tight", transparent=True)
         if GLOBAL_CONFIG.current_profile.verbose:
             plt.show(block=False)
@@ -693,7 +694,7 @@ def initialize_mfa_clustering(self):
             score_threshold = self.initial_sb_score_threshold
             self.export_xvectors()
else: - plda_transform_path = os.path.join(self.working_directory, "plda.pkl") + plda_transform_path = self.working_directory.joinpath("plda.pkl") with open(plda_transform_path, "rb") as f: self.plda: PldaModel = pickle.load(f) arguments = self.plda_classification_arguments() @@ -815,7 +816,7 @@ def classify_iteration(self, iteration=None) -> None: utterance_mapping = [] self.classification_score = 0 - plda_transform_path = os.path.join(self.working_directory, "plda.pkl") + plda_transform_path = self.working_directory.joinpath("plda.pkl") with open(plda_transform_path, "rb") as f: self.plda: PldaModel = pickle.load(f) arguments = self.plda_classification_arguments() @@ -1092,7 +1093,7 @@ def cluster_utterances(self) -> None: os.environ["OPENBLAS_NUM_THREADS"] = f"{GLOBAL_CONFIG.current_profile.num_jobs}" os.environ["MKL_NUM_THREADS"] = f"{GLOBAL_CONFIG.current_profile.num_jobs}" if self.metric is DistanceMetric.plda: - plda_transform_path = os.path.join(self.working_directory, "plda.pkl") + plda_transform_path = self.working_directory.joinpath("plda.pkl") with open(plda_transform_path, "rb") as f: self.plda: PldaModel = pickle.load(f) if self.evaluation_mode and GLOBAL_CONFIG.current_profile.debug: @@ -1383,7 +1384,7 @@ def refresh_plda_vectors(self): update_mapping.append({"id": utt_id, "plda_vector": ivectors[i, :]}) bulk_update(session, Utterance, update_mapping) session.commit() - plda_transform_path = os.path.join(self.working_directory, "plda.pkl") + plda_transform_path = self.working_directory.joinpath("plda.pkl") with open(plda_transform_path, "wb") as f: pickle.dump(self.plda, f) @@ -1460,7 +1461,7 @@ def export_files(self, output_directory: str) -> None: Output directory to save files """ if not self.overwrite and os.path.exists(output_directory): - output_directory = os.path.join(self.working_directory, "speaker_classification") + output_directory = self.working_directory.joinpath("speaker_classification") os.makedirs(output_directory, exist_ok=True) diagnostic_files = [ "diarization_evaluation_results.csv", @@ -1468,14 +1469,14 @@ def export_files(self, output_directory: str) -> None: "nearest_neighbors.png", ] for fname in diagnostic_files: - path = os.path.join(self.working_directory, fname) + path = self.working_directory.joinpath(fname) if os.path.exists(path): shutil.copyfile( path, os.path.join(output_directory, fname), ) with mfa_open(os.path.join(output_directory, "parameters.yaml"), "w") as f: - yaml.safe_dump( + yaml.dump( { "ivector_extractor_path": str(self.ivector_extractor_path), "expected_num_speakers": self.expected_num_speakers, @@ -1488,6 +1489,7 @@ def export_files(self, output_directory: str) -> None: "linkage": self.linkage, }, f, + Dumper=yaml.Dumper, ) with self.session() as session: diff --git a/montreal_forced_aligner/dictionary/mixins.py b/montreal_forced_aligner/dictionary/mixins.py index d6a6580a..a1a1070a 100644 --- a/montreal_forced_aligner/dictionary/mixins.py +++ b/montreal_forced_aligner/dictionary/mixins.py @@ -7,6 +7,7 @@ import re import typing from collections import Counter +from pathlib import Path from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple from montreal_forced_aligner.abc import DatabaseMixin @@ -839,9 +840,9 @@ def __init__(self, **kwargs): self._lexicon_fst_paths = {} @property - def word_boundary_int_path(self) -> str: + def word_boundary_int_path(self) -> Path: """Path to the word boundary integer IDs""" - return os.path.join(self.dictionary_output_directory, "phones", "word_boundary.int") + return 
self.dictionary_output_directory.joinpath("phones", "word_boundary.int") def _write_word_boundaries(self) -> None: """ @@ -999,11 +1000,11 @@ def _write_phone_sets(self) -> None: Write phone symbol sets to the temporary directory """ - sets_file = os.path.join(self.dictionary_output_directory, "phones", "sets.txt") - roots_file = os.path.join(self.dictionary_output_directory, "phones", "roots.txt") + sets_file = self.dictionary_output_directory.joinpath("phones", "sets.txt") + roots_file = self.dictionary_output_directory.joinpath("phones", "roots.txt") - sets_int_file = os.path.join(self.dictionary_output_directory, "phones", "sets.int") - roots_int_file = os.path.join(self.dictionary_output_directory, "phones", "roots.int") + sets_int_file = self.dictionary_output_directory.joinpath("phones", "sets.int") + roots_int_file = self.dictionary_output_directory.joinpath("phones", "roots.int") with mfa_open(sets_file, "w") as setf, mfa_open(roots_file, "w") as rootf, mfa_open( sets_int_file, "w" @@ -1043,40 +1044,40 @@ def _write_phone_sets(self) -> None: rootintf.write(f"shared split {phone_int_string}\n") @property - def phone_symbol_table_path(self) -> str: + def phone_symbol_table_path(self) -> Path: """Path to file containing phone symbols and their integer IDs""" - return os.path.join(self.phones_dir, "phones.txt") + return self.phones_dir.joinpath("phones.txt") @property - def grapheme_symbol_table_path(self) -> str: + def grapheme_symbol_table_path(self) -> Path: """Path to file containing grapheme symbols and their integer IDs""" - return os.path.join(self.phones_dir, "graphemes.txt") + return self.phones_dir.joinpath("graphemes.txt") @property - def disambiguation_symbols_txt_path(self) -> str: + def disambiguation_symbols_txt_path(self) -> Path: """Path to the file containing phone disambiguation symbols""" - return os.path.join(self.phones_dir, "disambiguation_symbols.txt") + return self.phones_dir.joinpath("disambiguation_symbols.txt") @property - def disambiguation_symbols_int_path(self) -> str: + def disambiguation_symbols_int_path(self) -> Path: """Path to the file containing integer IDs for phone disambiguation symbols""" if self._disambiguation_symbols_int_path is None: - self._disambiguation_symbols_int_path = os.path.join( - self.phones_dir, "disambiguation_symbols.int" + self._disambiguation_symbols_int_path = self.phones_dir.joinpath( + "disambiguation_symbols.int" ) return self._disambiguation_symbols_int_path @property - def phones_dir(self) -> str: + def phones_dir(self) -> Path: """Directory for storing phone information""" if self._phones_dir is None: - self._phones_dir = os.path.join(self.dictionary_output_directory, "phones") + self._phones_dir = self.dictionary_output_directory.joinpath("phones") return self._phones_dir @property - def topo_path(self) -> str: + def topo_path(self) -> Path: """Path to the dictionary's topology file""" - return os.path.join(self.phones_dir, "topo") + return self.phones_dir.joinpath("topo") def _write_disambig(self) -> None: """ @@ -1093,6 +1094,6 @@ def _write_disambig(self) -> None: for p_id, p in disambiguation_symbols: outf.write(f"{p}\n") intf.write(f"{p_id}\n") - phone_disambig_path = os.path.join(self.phones_dir, "phone_disambig.txt") + phone_disambig_path = self.phones_dir.joinpath("phone_disambig.txt") with mfa_open(phone_disambig_path, "w") as f: f.write(str(self.phone_mapping["#0"])) diff --git a/montreal_forced_aligner/dictionary/multispeaker.py b/montreal_forced_aligner/dictionary/multispeaker.py index f46602a8..0ddcd148 
100644 --- a/montreal_forced_aligner/dictionary/multispeaker.py +++ b/montreal_forced_aligner/dictionary/multispeaker.py @@ -118,7 +118,7 @@ def load_phone_groups(self) -> None: """ if self.phone_groups_path is not None and self.phone_groups_path.exists(): with mfa_open(self.phone_groups_path) as f: - self._phone_groups = yaml.safe_load(f) + self._phone_groups = yaml.load(f, Loader=yaml.Loader) if isinstance(self._phone_groups, list): self._phone_groups = {k: v for k, v in enumerate(self._phone_groups)} for k, v in self._phone_groups.items(): @@ -614,7 +614,7 @@ def apply_phonological_rules(self) -> None: if not self.rules_path or not self.rules_path.exists(): return with mfa_open(self.rules_path) as f: - rule_data = yaml.safe_load(f) + rule_data = yaml.load(f, Loader=yaml.Loader) with self.session() as session: num_words = session.query(Word).count() logger.info("Applying phonological rules...") @@ -803,7 +803,7 @@ def _write_probabilistic_fst_text( session: sqlalchemy.orm.session.Session, dictionary: Dictionary, silence_disambiguation_symbol=None, - path: typing.Optional[str] = None, + path: typing.Optional[Path] = None, alignment: bool = False, ) -> None: """ @@ -817,7 +817,7 @@ def _write_probabilistic_fst_text( Dictionary for generating L.fst silence_disambiguation_symbol: str, optional Symbol to use for disambiguating silence for L_disambig.fst - path: str, optional + path: :class:`~pathlib.Path`, optional Full path to write L.fst to alignment: bool Flag for whether the FST will be used to align lattices @@ -828,9 +828,9 @@ def _write_probabilistic_fst_text( base_ext = ".disambig_text_fst" disambiguation = True if path is not None: - path = path.replace(".fst", base_ext) + path = path.with_suffix(base_ext) else: - path = os.path.join(dictionary.temp_directory, "lexicon" + base_ext) + path = dictionary.temp_directory.joinpath("lexicon" + base_ext) start_state = 0 non_silence_state = 1 # Also loop state silence_state = 2 @@ -1230,12 +1230,12 @@ def export_trained_rules(self, output_directory: str) -> None: dialectal_rules["dialects"][dialect] = [] dialectal_rules["dialects"][dialect].append(d) with mfa_open(output_rules_path, "w") as f: - yaml.safe_dump(dict(dialectal_rules), f, allow_unicode=True) + yaml.dump(dict(dialectal_rules), f, Dumper=yaml.Dumper, allow_unicode=True) def export_lexicon( self, dictionary_id: int, - path: str, + path: Path, write_disambiguation: typing.Optional[bool] = False, probability: typing.Optional[bool] = False, ) -> None: @@ -1244,7 +1244,7 @@ def export_lexicon( Parameters ---------- - path: str + path: :class:`~pathlib.Path` Path to save dictionary write_disambiguation: bool, optional Flag for whether to include disambiguation information @@ -1307,7 +1307,7 @@ def _write_fst_binary( self, dictionary: Dictionary, write_disambiguation: bool = False, - path: typing.Optional[str] = None, + path: typing.Optional[Path] = None, ) -> None: """ Write the binary fst file to the temporary directory @@ -1327,27 +1327,27 @@ def _write_fst_binary( Dictionary object write_disambiguation: bool, optional Flag for including disambiguation symbols - path: str, optional + path: :class:`~pathlib.Path`, optional Full path to write compiled L.fst to """ text_ext = ".text_fst" binary_ext = ".fst" - word_disambig_path = os.path.join(dictionary.temp_directory, "word_disambig.txt") + word_disambig_path = dictionary.temp_directory.joinpath("word_disambig.txt") with mfa_open(word_disambig_path, "w") as f: f.write(str(self.word_mapping(dictionary.id)["#0"])) if write_disambiguation: 
text_ext = ".disambig_text_fst" binary_ext = ".disambig_fst" if path is not None: - text_path = path.replace(".fst", text_ext) - binary_path = path.replace(".fst", binary_ext) + text_path = path.with_suffix(text_ext) + binary_path = path.with_suffix(binary_ext) else: - text_path = os.path.join(dictionary.temp_directory, "lexicon" + text_ext) - binary_path = os.path.join(dictionary.temp_directory, "L" + binary_ext) + text_path = dictionary.temp_directory.joinpath("lexicon" + text_ext) + binary_path = dictionary.temp_directory.joinpath("L" + binary_ext) - words_file_path = os.path.join(dictionary.temp_directory, "words.txt") + words_file_path = dictionary.temp_directory.joinpath("words.txt") - log_path = os.path.join(dictionary.temp_directory, os.path.basename(binary_path) + ".log") + log_path = dictionary.temp_directory.joinpath(binary_path.name + ".log") with mfa_open(log_path, "w") as log_file: log_file.write(f"Phone isymbols: {self.phone_symbol_table_path}\n") log_file.write(f"Word osymbols: {words_file_path}\n") @@ -1360,7 +1360,7 @@ def _write_fst_binary( "--keep_state_numbering=true", text_path, ] - log_file.write(f"{' '.join(com)}\n") + log_file.write(f"{' '.join(map(str,com))}\n") log_file.flush() compile_proc = subprocess.Popen( com, @@ -1373,7 +1373,7 @@ def _write_fst_binary( self.phone_disambig_path, word_disambig_path, ] - log_file.write(f"{' '.join(com)}\n") + log_file.write(f"{' '.join(map(str,com))}\n") log_file.flush() selfloop_proc = subprocess.Popen( com, @@ -1387,7 +1387,7 @@ def _write_fst_binary( "-", binary_path, ] - log_file.write(f"{' '.join(com)}\n") + log_file.write(f"{' '.join(map(str,com))}\n") log_file.flush() arc_sort_proc = subprocess.Popen( com, @@ -1401,7 +1401,7 @@ def _write_fst_binary( "-", binary_path, ] - log_file.write(f"{' '.join(com)}\n") + log_file.write(f"{' '.join(map(str,com))}\n") log_file.flush() arc_sort_proc = subprocess.Popen( com, @@ -1753,4 +1753,4 @@ def identifier(self) -> str: @property def output_directory(self) -> str: """Root temporary directory to store all dictionary information""" - return os.path.join(GLOBAL_CONFIG.temporary_directory, self.identifier) + return GLOBAL_CONFIG.current_profile.temporary_directory.joinpath(self.identifier) diff --git a/montreal_forced_aligner/exceptions.py b/montreal_forced_aligner/exceptions.py index b3b4fb7c..dce0a5d0 100644 --- a/montreal_forced_aligner/exceptions.py +++ b/montreal_forced_aligner/exceptions.py @@ -10,6 +10,7 @@ import re import sys import typing +from pathlib import Path from typing import TYPE_CHECKING, Collection, Dict, List, Optional import requests.structures @@ -192,11 +193,11 @@ class ModelLoadError(ModelError): Parameters ---------- - path: str + path: :class:`~pathlib.Path` Path of the model archive """ - def __init__(self, path: str): + def __init__(self, path: typing.Union[str, Path]): super().__init__("") self.message_lines = [ f"The archive {self.printer.error_text(path)} could not be parsed as an MFA model." 
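A note on the `' '.join(map(str, com))` changes in `_write_fst_binary` above: the command lists now mix plain strings with `pathlib.Path` objects. `subprocess` accepts `os.PathLike` arguments directly on Python 3.6+, but `str.join` raises `TypeError` on `Path` entries, so the elements are cast to `str` only for the human-readable log line. A minimal sketch of the idiom, using an illustrative binary and paths rather than MFA's actual ones:

    import subprocess
    from pathlib import Path

    temp_directory = Path("/tmp/dictionary")  # hypothetical path for illustration
    com = [
        "fstcompile",  # stand-in for a Kaldi/OpenFst binary
        "--keep_state_numbering=true",
        temp_directory.joinpath("lexicon.text_fst"),  # a Path, not a str
    ]

    # " ".join(com) would raise TypeError because of the Path element;
    # stringify every element purely for the log line.
    print(" ".join(map(str, com)))

    # subprocess itself handles the mixed str/Path list unchanged:
    # proc = subprocess.Popen(com)
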
@@ -209,11 +210,11 @@ class ModelSaveError(ModelError): Parameters ---------- - path: str + path: :class:`~pathlib.Path` Path of the model archive """ - def __init__(self, path: str): + def __init__(self, path: Path): super().__init__("") self.message_lines = [ f"The archive {self.printer.error_text(path)} already exists.", @@ -283,11 +284,11 @@ class DictionaryPathError(DictionaryError): Parameters ---------- - input_path: str + input_path: :class:`~pathlib.Path` Path of the pronunciation dictionary """ - def __init__(self, input_path: str): + def __init__(self, input_path: Path): super().__init__("") self.message_lines = [ f"The specified path for the dictionary ({self.printer.error_text(input_path)}) was not found." @@ -300,11 +301,11 @@ class DictionaryFileError(DictionaryError): Parameters ---------- - input_path: str + input_path: :class:`~pathlib.Path` Path of the pronunciation dictionary """ - def __init__(self, input_path: str): + def __init__(self, input_path: Path): super().__init__("") self.message_lines = [ f"The specified path for the dictionary ({self.printer.error_text(input_path)}) is not a file." @@ -481,7 +482,7 @@ class AlignmentExportError(AlignmentError): """ - def __init__(self, path: str, error_lines: List[str]): + def __init__(self, path: Path, error_lines: List[str]): MFAError.__init__(self, f"Error was encountered in exporting {path}:") self.path = path self.message_lines.append("") @@ -567,11 +568,11 @@ class FileArgumentNotFoundError(ArgumentError): Parameters ---------- - path: str + path: :class:`~pathlib.Path` Path not found """ - def __init__(self, path): + def __init__(self, path: Path): super().__init__("") self.message_lines = [f'Could not find "{self.printer.error_text(path)}".'] @@ -806,11 +807,11 @@ class LanguageModelNotFoundError(LMError): Parameters ---------- - path: str + path: :class:`~pathlib.Path` Path to missing language model """ - def __init__(self, path: str): + def __init__(self, path: Path): super().__init__(f"Could not find a suitable language model: {path}") @@ -867,7 +868,7 @@ class KaldiProcessingError(MFAError): Overall log file to find more information """ - def __init__(self, error_logs: List[str], log_file: Optional[str] = None): + def __init__(self, error_logs: List[typing.Union[Path, str]], log_file: Optional[Path] = None): super().__init__( f"There were {len(error_logs)} job(s) with errors when running Kaldi binaries." 
) diff --git a/montreal_forced_aligner/g2p/generator.py b/montreal_forced_aligner/g2p/generator.py index 66f3d865..1114eb51 100644 --- a/montreal_forced_aligner/g2p/generator.py +++ b/montreal_forced_aligner/g2p/generator.py @@ -9,6 +9,7 @@ import queue import statistics import time +from pathlib import Path from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union import pynini @@ -353,7 +354,7 @@ class PyniniGenerator(G2PTopLevelMixin): G2P model """ - def __init__(self, g2p_model_path: str, strict_graphemes: bool = False, **kwargs): + def __init__(self, g2p_model_path: Path = None, strict_graphemes: bool = False, **kwargs): self.strict_graphemes = strict_graphemes super().__init__(**kwargs) self.g2p_model = G2PModel( @@ -518,14 +519,14 @@ def data_source_identifier(self) -> str: return "validation" @property - def data_directory(self) -> str: + def data_directory(self) -> Path: """Data directory""" return self.working_directory @property - def evaluation_csv_path(self) -> str: + def evaluation_csv_path(self) -> Path: """Path to working directory's CSV file""" - return os.path.join(self.working_directory, "pronunciation_evaluation.csv") + return self.working_directory.joinpath("pronunciation_evaluation.csv") def setup(self) -> None: """Set up the G2P validator""" @@ -588,6 +589,8 @@ def compute_validation_errors( ) continue hyp = hypothesis_values[word] + if not isinstance(hyp, list): + hyp = [hyp] for h in hyp: if h in gold_pronunciations: correct += 1 @@ -676,7 +679,7 @@ class PyniniWordListGenerator(PyniniValidator, DatabaseMixin): Parameters ---------- - word_list_path: str + word_list_path: :class:`~pathlib.Path` Path to word list file See Also @@ -692,12 +695,12 @@ class PyniniWordListGenerator(PyniniValidator, DatabaseMixin): Word list to generate pronunciations """ - def __init__(self, word_list_path: str, **kwargs): + def __init__(self, word_list_path: Path, **kwargs): self.word_list_path = word_list_path super().__init__(**kwargs) @property - def data_directory(self) -> str: + def data_directory(self) -> Path: """Data directory""" return self.working_directory diff --git a/montreal_forced_aligner/g2p/mixins.py b/montreal_forced_aligner/g2p/mixins.py index fa6bc721..a5ec43ee 100644 --- a/montreal_forced_aligner/g2p/mixins.py +++ b/montreal_forced_aligner/g2p/mixins.py @@ -89,7 +89,7 @@ def export_pronunciations(self, output_file_path: typing.Union[str, Path]) -> No Parameters ---------- - output_file_path: str + output_file_path: :class:`~pathlib.Path` Path to save """ if isinstance(output_file_path, str): diff --git a/montreal_forced_aligner/g2p/phonetisaurus_trainer.py b/montreal_forced_aligner/g2p/phonetisaurus_trainer.py index 19633c34..c9d34954 100644 --- a/montreal_forced_aligner/g2p/phonetisaurus_trainer.py +++ b/montreal_forced_aligner/g2p/phonetisaurus_trainer.py @@ -1,5 +1,6 @@ from __future__ import annotations +import abc import collections import logging import multiprocessing as mp @@ -7,6 +8,7 @@ import queue import subprocess import time +from pathlib import Path import dataclassy import numpy @@ -47,7 +49,7 @@ class MaximizationArguments: """Arguments for the MaximizationWorker""" db_string: str - far_path: str + far_path: Path penalize_em: bool batch_size: int @@ -57,7 +59,7 @@ class ExpectationArguments: """Arguments for the ExpectationWorker""" db_string: str - far_path: str + far_path: Path batch_size: int @@ -66,8 +68,8 @@ class AlignmentExportArguments: """Arguments for the AlignmentExportWorker""" db_string: str - log_path: str - far_path: 
str + log_path: Path + far_path: Path penalize: bool @@ -75,9 +77,9 @@ class AlignmentExportArguments: class NgramCountArguments: """Arguments for the NgramCountWorker""" - log_path: str - far_path: str - alignment_symbols_path: str + log_path: Path + far_path: Path + alignment_symbols_path: Path order: int @@ -86,8 +88,8 @@ class AlignmentInitArguments: """Arguments for the alignment initialization worker""" db_string: str - log_path: str - far_path: str + log_path: Path + far_path: Path deletions: bool insertions: bool restrict: bool @@ -142,7 +144,7 @@ def __init__( self.seq_sep = args.seq_sep self.skip = args.skip self.far_path = args.far_path - self.sym_path = self.far_path.replace(".far", ".syms") + self.sym_path = self.far_path.with_suffix(".syms") self.log_path = args.log_path self.db_string = args.db_string self.batch_size = args.batch_size @@ -318,7 +320,7 @@ def run(self) -> None: if data: data = {k: float(v) for k, v in data.items()} self.return_queue.put((self.job_name, data, count)) - symbol_table.write_text(self.far_path.replace(".far", ".syms")) + symbol_table.write_text(self.far_path.with_suffix(".syms")) return except Exception as e: self.stopped.stop() @@ -367,7 +369,7 @@ def run(self) -> None: ).execution_options(logging_token=f"{type(self).__name__}_engine") Session = scoped_session(sessionmaker(bind=engine, autoflush=False, autocommit=False)) far_reader = pywrapfst.FarReader.open(self.far_path) - symbol_table = pynini.SymbolTable.read_text(self.far_path.replace(".far", ".syms")) + symbol_table = pynini.SymbolTable.read_text(self.far_path.with_suffix(".syms")) symbol_mapper = {} data = {} count = 0 @@ -454,7 +456,7 @@ def __init__( def run(self) -> None: """Run the function""" - symbol_table = pynini.SymbolTable.read_text(self.far_path.replace(".far", ".syms")) + symbol_table = pynini.SymbolTable.read_text(self.far_path.with_suffix(".syms")) count = 0 engine = sqlalchemy.create_engine( self.db_string, @@ -481,7 +483,9 @@ def run(self) -> None: weight = pynini.Weight("log", 99) alignment_model[symbol_table.find(m2m.symbol)] = weight far_reader = pywrapfst.FarReader.open(self.far_path) - far_writer = pywrapfst.FarWriter.create(self.far_path + ".temp", arc_type="log") + far_writer = pywrapfst.FarWriter.create( + self.far_path.with_suffix(self.far_path.suffix + ".temp"), arc_type="log" + ) while not far_reader.done(): if self.stopped.stop_check(): break @@ -506,7 +510,7 @@ def run(self) -> None: del far_reader del far_writer os.remove(self.far_path) - os.rename(self.far_path + ".temp", self.far_path) + os.rename(self.far_path.with_suffix(self.far_path.suffix + ".temp"), self.far_path) except Exception as e: self.stopped.stop() self.return_queue.put(e) @@ -543,10 +547,10 @@ def __init__(self, return_queue: mp.Queue, stopped: Stopped, args: AlignmentExpo def run(self) -> None: """Run the function""" - symbol_table = pynini.SymbolTable.read_text(self.far_path.replace(".far", ".syms")) + symbol_table = pynini.SymbolTable.read_text(self.far_path.with_suffix(".syms")) with mfa_open(self.log_path, "w") as log_file: far_reader = pywrapfst.FarReader.open(self.far_path) - one_best_path = self.far_path + ".strings" + one_best_path = self.far_path.with_suffix(".strings") no_alignment_count = 0 total = 0 with mfa_open(one_best_path, "w") as f: @@ -628,8 +632,8 @@ def __init__(self, return_queue: mp.Queue, stopped: Stopped, args: NgramCountArg def run(self) -> None: """Run the function""" with mfa_open(self.log_path, "w") as log_file: - one_best_path = self.far_path + ".strings" - 
ngram_count_path = self.far_path.replace(".far", ".cnts") + one_best_path = self.far_path.with_suffix(".strings") + ngram_count_path = self.far_path.with_suffix(".cnts") farcompile_proc = subprocess.Popen( [ thirdparty_binary("farcompilestrings"), @@ -652,7 +656,6 @@ def run(self) -> None: ], stderr=log_file, stdin=farcompile_proc.stdout, - # stdout=subprocess.PIPE, env=os.environ, ) ngramcount_proc.communicate() @@ -775,8 +778,8 @@ def initialize_alignments(self) -> None: for i in range(GLOBAL_CONFIG.num_jobs): args = AlignmentInitArguments( self.db_string, - os.path.join(self.working_log_directory, f"alignment_init.{i}.log"), - os.path.join(self.working_directory, f"{i}.far"), + self.working_log_directory.joinpath(f"alignment_init.{i}.log"), + self.working_directory.joinpath(f"{i}.far"), self.deletions, self.insertions, self.restrict_m2m, @@ -904,7 +907,7 @@ def maximization(self, last_iteration=False) -> float: for i in range(GLOBAL_CONFIG.num_jobs): args = MaximizationArguments( self.db_string, - os.path.join(self.working_directory, f"{i}.far"), + self.working_directory.joinpath(f"{i}.far"), self.penalize_em, self.batch_size, ) @@ -958,7 +961,7 @@ def expectation(self) -> None: for i in range(GLOBAL_CONFIG.num_jobs): args = ExpectationArguments( self.db_string, - os.path.join(self.working_directory, f"{i}.far"), + self.working_directory.joinpath(f"{i}.far"), self.batch_size, ) procs.append(ExpectationWorker(i, return_queue, stopped, args)) @@ -1019,13 +1022,13 @@ def train_ngram_model(self) -> None: count_paths = [] for i in range(GLOBAL_CONFIG.num_jobs): args = NgramCountArguments( - os.path.join(self.working_log_directory, f"ngram_count.{i}.log"), - os.path.join(self.working_directory, f"{i}.far"), + self.working_log_directory.joinpath(f"ngram_count.{i}.log"), + self.working_directory.joinpath(f"{i}.far"), self.alignment_symbols_path, self.order, ) procs.append(NgramCountWorker(return_queue, stopped, args)) - count_paths.append(args.far_path.replace(".far", ".cnts")) + count_paths.append(args.far_path.with_suffix(".cnts")) procs[i].start() with tqdm.tqdm( @@ -1056,11 +1059,11 @@ def train_ngram_model(self) -> None: logger.info("Done counting ngrams!") logger.info("Training ngram model...") - with mfa_open(os.path.join(self.working_log_directory, "model.log"), "w") as logf: + with mfa_open(self.working_log_directory.joinpath("model.log"), "w") as logf: ngrammerge_proc = subprocess.Popen( [ thirdparty_binary("ngrammerge"), - f'--ofile={self.ngram_path.replace(".fst", ".cnts")}', + f'--ofile={self.ngram_path.with_suffix(".cnts")}', *count_paths, ], stderr=logf, @@ -1072,7 +1075,7 @@ def train_ngram_model(self) -> None: [ thirdparty_binary("ngrammake"), f"--method={self.smoothing_method}", - self.ngram_path.replace(".fst", ".cnts"), + self.ngram_path.with_suffix(".cnts"), ], stderr=logf, stdout=subprocess.PIPE, @@ -1213,7 +1216,22 @@ def train_alignments(self) -> None: break @property - def data_directory(self) -> str: + @abc.abstractmethod + def working_directory(self) -> Path: + ... + + @property + @abc.abstractmethod + def working_log_directory(self) -> Path: + ... + + @property + @abc.abstractmethod + def db_string(self) -> str: + ... 
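The `@property` stacked over `@abc.abstractmethod` in the added block above is the standard pattern for declaring, in a mixin, attributes that concrete trainers must supply; `@property` has to be the outermost decorator for the combination to work. A small self-contained sketch of the pattern (class and path names here are hypothetical, not MFA's):

    import abc
    from pathlib import Path

    class PathMixin(abc.ABC):
        @property
        @abc.abstractmethod
        def working_directory(self) -> Path:
            """Concrete subclasses decide where temporary files live."""
            ...

        @property
        def far_path(self) -> Path:
            # Mixin code can build paths off the abstract property.
            return self.working_directory.joinpath("aligned.far")

    class DummyTrainer(PathMixin):
        @property
        def working_directory(self) -> Path:
            return Path("/tmp/dummy_trainer")

    print(DummyTrainer().far_path)  # /tmp/dummy_trainer/aligned.far
    # PathMixin() itself raises TypeError: can't instantiate an abstract class
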
+ + @property + def data_directory(self) -> Path: """Data directory for trainer""" return self.working_directory @@ -1226,63 +1244,62 @@ def data_source_identifier(self) -> str: """Dictionary name""" return self._data_source - def export_model(self, output_model_path: str) -> None: + def export_model(self, output_model_path: Path) -> None: """ Export G2P model to specified path Parameters ---------- - output_model_path:str + output_model_path: :class:`~pathlib.Path` Path to export model """ - directory, filename = os.path.split(output_model_path) - basename, _ = os.path.splitext(filename) - models_temp_dir = os.path.join(self.working_directory, "model_archive_temp") - model = G2PModel.empty(basename, root_directory=models_temp_dir) + directory = output_model_path.parent + directory.mkdir(parents=True, exist_ok=True) + models_temp_dir = self.working_directory.joinpath("model_archive_temp") + model = G2PModel.empty(output_model_path.stem, root_directory=models_temp_dir) model.add_meta_file(self) model.add_fst_model(self.working_directory) model.add_sym_path(self.working_directory) if directory: os.makedirs(directory, exist_ok=True) - basename, _ = os.path.splitext(output_model_path) - model.dump(basename) + model.dump(output_model_path) model.clean_up() logger.info(f"Saved model to {output_model_path}") @property - def alignment_model_path(self) -> str: + def alignment_model_path(self) -> Path: """Path to store alignment model FST""" - return os.path.join(self.working_directory, "align.fst") + return self.working_directory.joinpath("align.fst") @property - def ngram_path(self) -> str: + def ngram_path(self) -> Path: """Path to store ngram model""" - return os.path.join(self.working_directory, "ngram.fst") + return self.working_directory.joinpath("ngram.fst") @property - def fst_path(self) -> str: + def fst_path(self) -> Path: """Path to store final trained model""" - return os.path.join(self.working_directory, "model.fst") + return self.working_directory.joinpath("model.fst") @property - def alignment_symbols_path(self) -> str: + def alignment_symbols_path(self) -> Path: """Path to alignment symbol table""" - return os.path.join(self.working_directory, "alignment.syms") + return self.working_directory.joinpath("alignment.syms") @property - def grapheme_symbols_path(self) -> str: + def grapheme_symbols_path(self) -> Path: """Path to final model's grapheme symbol table""" - return os.path.join(self.working_directory, "graphemes.txt") + return self.working_directory.joinpath("graphemes.txt") @property - def phone_symbols_path(self) -> str: + def phone_symbols_path(self) -> Path: """Path to final model's phone symbol table""" - return os.path.join(self.working_directory, "phones.txt") + return self.working_directory.joinpath("phones.txt") @property - def far_path(self) -> str: + def far_path(self) -> Path: """Path to store final aligned FSTs""" - return os.path.join(self.working_directory, "aligned.far") + return self.working_directory.joinpath("aligned.far") def export_alignments(self) -> None: """ @@ -1298,12 +1315,12 @@ def export_alignments(self) -> None: for i in range(GLOBAL_CONFIG.num_jobs): args = AlignmentExportArguments( self.db_string, - os.path.join(self.working_log_directory, f"ngram_count.{i}.log"), - os.path.join(self.working_directory, f"{i}.far"), + self.working_log_directory.joinpath(f"ngram_count.{i}.log"), + self.working_directory.joinpath(f"{i}.far"), self.penalize, ) procs.append(AlignmentExporter(return_queue, stopped, args)) - 
count_paths.append(args.far_path.replace(".far", ".cnts"))
+            count_paths.append(args.far_path.with_suffix(".cnts"))
             procs[i].start()

         with tqdm.tqdm(
@@ -1345,7 +1362,7 @@ def export_alignments(self) -> None:
                 stdout=subprocess.PIPE,
             )
             for j in range(GLOBAL_CONFIG.num_jobs):
-                text_path = os.path.join(self.working_directory, f"{j}.far.strings")
+                text_path = self.working_directory.joinpath(f"{j}.strings")
                 with mfa_open(text_path, "r") as f:
                     for line in f:
                         symbols_proc.stdin.write(line)
@@ -1367,13 +1384,13 @@ def __init__(
         self,
         **kwargs,
     ):
-        self._data_source = os.path.splitext(os.path.basename(kwargs["dictionary_path"]))[0]
+        self._data_source = kwargs["dictionary_path"].stem
         super().__init__(**kwargs)
         self.ler = None
         self.wer = None

     @property
-    def data_directory(self) -> str:
+    def data_directory(self) -> Path:
         """Data directory for trainer"""
         return self.working_directory
@@ -1454,9 +1471,9 @@ def evaluate_g2p_model(self) -> None:
         """
         Validate the G2P model against held out data
         """
-        temp_model_path = os.path.join(self.working_log_directory, "g2p_model.zip")
+        temp_model_path = self.working_log_directory.joinpath("g2p_model.zip")
         self.export_model(temp_model_path)
-        temp_dir = os.path.join(self.working_directory, "validation")
+        temp_dir = self.working_directory.joinpath("validation")
         os.makedirs(temp_dir, exist_ok=True)
         with self.session() as session:
             validation_set = collections.defaultdict(set)
@@ -1474,7 +1491,7 @@ def evaluate_g2p_model(self) -> None:
                 num_pronunciations=self.num_pronunciations,
             )
             output = gen.generate_pronunciations()
-        with mfa_open(os.path.join(temp_dir, "validation_output.txt"), "w") as f:
+        with mfa_open(temp_dir.joinpath("validation_output.txt"), "w") as f:
             for (orthography, pronunciations) in output.items():
                 if not pronunciations:
                     continue
@@ -1485,9 +1502,9 @@ def evaluate_g2p_model(self) -> None:
         gen.compute_validation_errors(validation_set, output)

     def compute_initial_ngrams(self) -> None:
-        word_path = os.path.join(self.working_directory, "words.txt")
-        word_ngram_path = os.path.join(self.working_directory, "grapheme_ngram.fst")
-        word_symbols_path = os.path.join(self.working_directory, "grapheme_ngram.syms")
+        word_path = self.working_directory.joinpath("words.txt")
+        word_ngram_path = self.working_directory.joinpath("grapheme_ngram.fst")
+        word_symbols_path = self.working_directory.joinpath("grapheme_ngram.syms")
         symbols_proc = subprocess.Popen(
             [
                 thirdparty_binary("ngramsymbols"),
@@ -1559,13 +1576,13 @@ def compute_initial_ngrams(self) -> None:
                     ngrams.add(ngram)
         print_proc.wait()
-        with mfa_open(word_ngram_path.replace(".fst", ".ngrams"), "w") as f:
+        with mfa_open(word_ngram_path.with_suffix(".ngrams"), "w") as f:
             for ngram in sorted(ngrams):
                 f.write(f"{ngram}\n")

-        phone_path = os.path.join(self.working_directory, "pronunciations.txt")
-        phone_ngram_path = os.path.join(self.working_directory, "phone_ngram.fst")
-        phone_symbols_path = os.path.join(self.working_directory, "phone_ngram.syms")
+        phone_path = self.working_directory.joinpath("pronunciations.txt")
+        phone_ngram_path = self.working_directory.joinpath("phone_ngram.fst")
+        phone_symbols_path = self.working_directory.joinpath("phone_ngram.syms")
         symbols_proc = subprocess.Popen(
             [
                 thirdparty_binary("ngramsymbols"),
@@ -1634,7 +1651,7 @@ def compute_initial_ngrams(self) -> None:
                     ngrams.add(ngram)
         print_proc.wait()
-        with mfa_open(phone_ngram_path.replace(".fst", ".ngrams"), "w") as f:
+        with mfa_open(phone_ngram_path.with_suffix(".ngrams"), "w") as f:
             for ngram in sorted(ngrams):
f.write(f"{ngram}\n") @@ -1716,10 +1733,8 @@ def initialize_training(self) -> None: .join(Word.job) .filter(Word2Job.training == True) # noqa ) - with mfa_open( - os.path.join(self.working_directory, "words.txt"), "w" - ) as word_f, mfa_open( - os.path.join(self.working_directory, "pronunciations.txt"), "w" + with mfa_open(self.working_directory.joinpath("words.txt"), "w") as word_f, mfa_open( + self.working_directory.joinpath("pronunciations.txt"), "w" ) as phone_f: for pronunciation, word in query: word = list(word) diff --git a/montreal_forced_aligner/g2p/trainer.py b/montreal_forced_aligner/g2p/trainer.py index a9dd4a66..6a4e9b8c 100644 --- a/montreal_forced_aligner/g2p/trainer.py +++ b/montreal_forced_aligner/g2p/trainer.py @@ -12,6 +12,8 @@ import shutil import subprocess import time +import typing +from pathlib import Path from typing import Any, List, NamedTuple, Set import pynini @@ -46,17 +48,17 @@ class RandomStart(NamedTuple): idx: int seed: int - input_far_path: str - output_far_path: str - cg_path: str - tempdir: str + input_far_path: Path + output_far_path: Path + cg_path: Path + tempdir: Path train_opts: List[str] -def _get_far_labels(far_path: str) -> Set[int]: +def _get_far_labels(far_path: typing.Union[Path, str]) -> Set[int]: """Extracts label set from acceptors in a FAR. Args: - far_path: path to FAR file. + far_path: :class:`~pathlib.Path` to FAR file. Returns: A set of integer labels found in the FAR. """ @@ -106,15 +108,15 @@ def run(self) -> None: try: start = time.time() # Randomize channel model. - rfst_path = os.path.join(args.tempdir, f"random-{args.seed:05d}.fst") - afst_path = os.path.join(args.tempdir, f"aligner-{args.seed:05d}.fst") - likelihood_path = afst_path.replace(".fst", ".like") - if not os.path.exists(afst_path): + rfst_path = args.tempdir.joinpath(f"random-{args.seed:05d}.fst") + afst_path = args.tempdir.joinpath(f"aligner-{args.seed:05d}.fst") + likelihood_path = afst_path.with_suffix(".like") + if not afst_path.exists(): cmd = [ thirdparty_binary("baumwelchrandomize"), f"--seed={args.seed}", - args.cg_path, - rfst_path, + str(args.cg_path), + str(rfst_path), ] subprocess.check_call(cmd, stderr=log_file, env=os.environ) random_end = time.time() @@ -127,10 +129,10 @@ def run(self) -> None: cmd = [ thirdparty_binary("baumwelchtrain"), *args.train_opts, - args.input_far_path, - args.output_far_path, - rfst_path, - afst_path, + str(args.input_far_path), + str(args.output_far_path), + str(rfst_path), + str(afst_path), ] log_file.write(f"{args.seed} train command: {' '.join(cmd)}\n") log_file.flush() @@ -306,39 +308,39 @@ def architecture(self) -> str: return "pynini" @property - def input_far_path(self) -> str: + def input_far_path(self) -> Path: """Path to store grapheme archive""" - return os.path.join(self.working_directory, f"{self.data_source_identifier}.g.far") + return self.working_directory.joinpath(f"{self.data_source_identifier}.g.far") @property - def output_far_path(self) -> str: + def output_far_path(self) -> Path: """Path to store phone archive""" - return os.path.join(self.working_directory, f"{self.data_source_identifier}.p.far") + return self.working_directory.joinpath(f"{self.data_source_identifier}.p.far") @property - def cg_path(self) -> str: + def cg_path(self) -> Path: """Path to covering grammar FST""" - return os.path.join(self.working_directory, f"{self.data_source_identifier}.cg.fst") + return self.working_directory.joinpath(f"{self.data_source_identifier}.cg.fst") @property - def align_path(self) -> str: + def 
align_path(self) -> Path: """Path to store alignment models""" - return os.path.join(self.working_directory, f"{self.data_source_identifier}.align.fst") + return self.working_directory.joinpath(f"{self.data_source_identifier}.align.fst") @property - def afst_path(self) -> str: + def afst_path(self) -> Path: """Path to store aligned FSTs""" - return os.path.join(self.working_directory, f"{self.data_source_identifier}.afst.far") + return self.working_directory.joinpath(f"{self.data_source_identifier}.afst.far") @property - def input_path(self) -> str: + def input_path(self) -> Path: """Path to temporary file to store grapheme training data""" - return os.path.join(self.working_directory, f"input_{self.data_source_identifier}.txt") + return self.working_directory.joinpath(f"input_{self.data_source_identifier}.txt") @property - def output_path(self) -> str: + def output_path(self) -> Path: """Path to temporary file to store phone training data""" - return os.path.join(self.working_directory, f"output_{self.data_source_identifier}.txt") + return self.working_directory.joinpath(f"output_{self.data_source_identifier}.txt") def generate_model(self) -> None: """ @@ -348,7 +350,7 @@ def generate_model(self) -> None: if os.path.exists(self.fst_path): logger.info("Model building already done, skipping!") return - with mfa_open(os.path.join(self.working_log_directory, "model.log"), "w") as logf: + with mfa_open(self.working_log_directory.joinpath("model.log"), "w") as logf: ngramcount_proc = subprocess.Popen( [ thirdparty_binary("ngramcount"), @@ -401,28 +403,28 @@ def generate_model(self) -> None: sort_proc.communicate() @property - def fst_path(self) -> str: + def fst_path(self) -> Path: """Internal temporary FST file""" if self._fst_path is not None: return self._fst_path - return os.path.join(self.working_directory, f"{self.data_source_identifier}.fst") + return self.working_directory.joinpath(f"{self.data_source_identifier}.fst") @property - def far_path(self) -> str: + def far_path(self) -> Path: """Internal temporary FAR file""" - return os.path.join(self.working_directory, f"{self.data_source_identifier}.far") + return self.working_directory.joinpath(f"{self.data_source_identifier}.far") @property - def encoder_path(self) -> str: + def encoder_path(self) -> Path: """Internal temporary encoder file""" - return os.path.join(self.working_directory, f"{self.data_source_identifier}.enc") + return self.working_directory.joinpath(f"{self.data_source_identifier}.enc") @property - def sym_path(self) -> str: + def sym_path(self) -> Path: """Internal temporary symbol file""" if self._sym_path is not None: return self._sym_path - return os.path.join(self.working_directory, "phones.txt") + return self.working_directory.joinpath("phones.txt") def align_g2p(self) -> None: """Runs the entire alignment regimen.""" @@ -439,7 +441,7 @@ def _lexicon_covering(self, input_path=None, output_path=None) -> None: """Builds covering grammar and lexicon FARs.""" # Sets of labels for the covering grammar. 
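One caveat that applies to the many `str.replace` to `Path.with_suffix` conversions in this patch: `with_suffix` swaps out only the final suffix, so it is not a drop-in replacement when a suffix is meant to be appended rather than substituted. A quick illustration (file names are arbitrary):

    from pathlib import Path

    far_path = Path("0.far")

    # with_suffix replaces the last extension:
    print(far_path.with_suffix(".like"))     # 0.like
    print(far_path.with_suffix(".strings"))  # 0.strings, not 0.far.strings

    # To append instead, re-attach the old suffix, as the FAR-rewriting
    # worker earlier in this patch does for its temporary file:
    print(far_path.with_suffix(far_path.suffix + ".temp"))  # 0.far.temp

    # The old string-based code had subtly different semantics: str.replace
    # would also rewrite ".far" if it happened to occur in a directory name.
    print("0.far".replace(".far", ".cnts"))  # 0.cnts
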
with mfa_open( - os.path.join(self.working_log_directory, "covering_grammar.log"), "w" + self.working_log_directory.joinpath("covering_grammar.log"), "w" ) as log_file: if input_path is None: input_path = self.input_path @@ -458,7 +460,7 @@ def _lexicon_covering(self, input_path=None, output_path=None) -> None: else: com.append("--token_type=utf8") com.extend([input_path, self.input_far_path]) - print(" ".join(com), file=log_file) + print(" ".join(map(str, com)), file=log_file) subprocess.check_call(com, env=os.environ, stderr=log_file, stdout=log_file) com = [ thirdparty_binary("farcompilestrings"), @@ -468,7 +470,7 @@ def _lexicon_covering(self, input_path=None, output_path=None) -> None: output_path, self.output_far_path, ] - print(" ".join(com), file=log_file) + print(" ".join(map(str, com)), file=log_file) subprocess.check_call(com, env=os.environ, stderr=log_file, stdout=log_file) ilabels = _get_far_labels(self.input_far_path) print(ilabels, file=log_file) @@ -545,7 +547,7 @@ def _alignments(self) -> None: return_queue = mp.Queue() procs = [] for i in range(GLOBAL_CONFIG.num_jobs): - log_path = os.path.join(self.working_log_directory, f"baumwelch.{i}.log") + log_path = self.working_log_directory.joinpath(f"baumwelch.{i}.log") p = RandomStartWorker( i, job_queue, @@ -560,6 +562,7 @@ def _alignments(self) -> None: try: result = return_queue.get(timeout=1) if isinstance(result, Exception): + error_dict[getattr(result, "job_name", 0)] = result continue if stopped.stop_check(): @@ -595,6 +598,7 @@ def _alignments(self) -> None: cmd.append(self.output_far_path) cmd.append(self.align_path) cmd.append(self.afst_path) + cmd = [str(x) for x in cmd] logger.debug(f"Subprocess call: {cmd}") subprocess.check_call(cmd, env=os.environ) logger.info("Completed computing alignments!") @@ -726,7 +730,7 @@ def initialize_training(self) -> None: } if GLOBAL_CONFIG.debug: with mfa_open( - os.path.join(self.working_directory, "validation_set.txt"), + self.working_directory.joinpath("validation_set.txt"), "w", encoding="utf8", ) as f: @@ -771,32 +775,32 @@ def clean_up(self) -> None: if GLOBAL_CONFIG.debug: return for name in os.listdir(self.working_directory): - path = os.path.join(self.working_directory, name) + path = self.working_directory.joinpath(name) if os.path.isdir(path): shutil.rmtree(path, ignore_errors=True) elif not name.endswith(".log"): os.remove(path) - def export_model(self, output_model_path: str) -> None: + def export_model(self, output_model_path: Path) -> None: """ Export G2P model to specified path Parameters ---------- - output_model_path:str + output_model_path: :class:`~pathlib.Path` Path to export model """ - directory, filename = os.path.split(output_model_path) - basename, _ = os.path.splitext(filename) - models_temp_dir = os.path.join(self.working_directory, "model_archive_temp") - model = G2PModel.empty(basename, root_directory=models_temp_dir) + directory = output_model_path.parent + directory.mkdir(parents=True, exist_ok=True) + + models_temp_dir = self.working_directory.joinpath("model_archive_temp") + model = G2PModel.empty(output_model_path.stem, root_directory=models_temp_dir) model.add_meta_file(self) model.add_fst_model(self.working_directory) model.add_sym_path(self.working_directory) if directory: os.makedirs(directory, exist_ok=True) - basename, _ = os.path.splitext(output_model_path) - model.dump(basename) + model.dump(output_model_path) model.clean_up() # self.clean_up() logger.info(f"Saved model to {output_model_path}") @@ -823,9 +827,9 @@ def train(self) -> None: 
def finalize_training(self) -> None: """Finalize training""" - shutil.copyfile(self.fst_path, os.path.join(self.working_directory, "model.fst")) + shutil.copyfile(self.fst_path, self.working_directory.joinpath("model.fst")) shutil.copyfile( - self.phone_symbol_table_path, os.path.join(self.working_directory, "phones.txt") + self.phone_symbol_table_path, self.working_directory.joinpath("phones.txt") ) if self.evaluation_mode: self.evaluate_g2p_model() @@ -834,7 +838,7 @@ def evaluate_g2p_model(self) -> None: """ Validate the G2P model against held out data """ - temp_model_path = os.path.join(self.working_log_directory, "g2p_model.zip") + temp_model_path = self.working_log_directory.joinpath("g2p_model.zip") self.export_model(temp_model_path) gen = PyniniValidator( diff --git a/montreal_forced_aligner/helper.py b/montreal_forced_aligner/helper.py index b645e570..54f96c1c 100644 --- a/montreal_forced_aligner/helper.py +++ b/montreal_forced_aligner/helper.py @@ -75,7 +75,7 @@ def load_configuration(config_path: typing.Union[str, Path]) -> Dict[str, Any]: Parameters ---------- - config_path: str or Path + config_path: :class:`~pathlib.Path` Path to yaml or json configuration file Returns @@ -88,7 +88,7 @@ def load_configuration(config_path: typing.Union[str, Path]) -> Dict[str, Any]: config_path = Path(config_path) with mfa_open(config_path, "r") as f: if config_path.suffix == ".yaml": - data = yaml.load(f, Loader=yaml.SafeLoader) + data = yaml.load(f, Loader=yaml.Loader) elif config_path.suffix == ".json": data = json.load(f) if not data: @@ -308,7 +308,7 @@ def error_text(self, text: Any) -> str: str Highlighted text """ - return self.colorize(text, "red") + return self.colorize(str(text), "red") def emphasized_text(self, text: Any) -> str: """ @@ -324,7 +324,7 @@ def emphasized_text(self, text: Any) -> str: str Highlighted text """ - return self.colorize(text, "bright") + return self.colorize(str(text), "bright") def pass_text(self, text: Any) -> str: """ @@ -340,7 +340,7 @@ def pass_text(self, text: Any) -> str: str Highlighted text """ - return self.colorize(text, "green") + return self.colorize(str(text), "green") def warning_text(self, text: Any) -> str: """ @@ -356,7 +356,7 @@ def warning_text(self, text: Any) -> str: str Highlighted text """ - return self.colorize(text, "yellow") + return self.colorize(str(text), "yellow") @property def indent_string(self) -> str: @@ -420,7 +420,7 @@ def format_info_lines(self, lines: Union[list[str], str]) -> List[str]: for i, line in enumerate(lines): lines[i] = ansiwrap.fill( - line, + str(line), initial_indent=self.indent_string, subsequent_indent=" " * self.indent_size * (self.indent_level + 1), width=shutil.get_terminal_size().columns, @@ -705,7 +705,7 @@ def load_scp_safe(string: str) -> str: return string.replace("_MFASPACE_", " ") -def output_mapping(mapping: Dict[str, Any], path: str, skip_safe: bool = False) -> None: +def output_mapping(mapping: Dict[str, Any], path: Path, skip_safe: bool = False) -> None: """ Helper function to save mapping information (i.e., utt2spk) in Kaldi scp format @@ -716,7 +716,7 @@ def output_mapping(mapping: Dict[str, Any], path: str, skip_safe: bool = False) ---------- mapping: dict[str, Any] Mapping to output - path: str + path: :class:`~pathlib.Path` Path to save mapping skip_safe: bool, optional Flag for whether to skip over making a string safe @@ -733,7 +733,7 @@ def output_mapping(mapping: Dict[str, Any], path: str, skip_safe: bool = False) f.write(f"{make_scp_safe(k)} {v}\n") -def load_scp(path: str, 
data_type: Optional[Type] = str) -> Dict[str, Any]: +def load_scp(path: Path, data_type: Optional[Type] = str) -> Dict[str, Any]: """ Load a Kaldi script file (.scp) @@ -750,7 +750,7 @@ def load_scp(path: str, data_type: Optional[Type] = str) -> Dict[str, Any]: Parameters ---------- - path : str + path : :class:`~pathlib.Path` Path to Kaldi script file data_type : type Type to coerce the data to diff --git a/montreal_forced_aligner/ivector/multiprocessing.py b/montreal_forced_aligner/ivector/multiprocessing.py index d51b3dee..cca229fa 100644 --- a/montreal_forced_aligner/ivector/multiprocessing.py +++ b/montreal_forced_aligner/ivector/multiprocessing.py @@ -5,6 +5,7 @@ import re import subprocess import typing +from pathlib import Path from sqlalchemy.orm import Session, joinedload @@ -31,8 +32,8 @@ class GmmGselectArguments(MfaArguments): feature_options: MetaDict ivector_options: MetaDict - dubm_model: str - gselect_path: str + dubm_model: Path + gselect_path: Path class AccGlobalStatsArguments(MfaArguments): @@ -40,9 +41,9 @@ class AccGlobalStatsArguments(MfaArguments): feature_options: MetaDict ivector_options: MetaDict - gselect_path: str - acc_path: str - dubm_model: str + gselect_path: Path + acc_path: Path + dubm_model: Path class GaussToPostArguments(MfaArguments): @@ -50,8 +51,8 @@ class GaussToPostArguments(MfaArguments): feature_options: MetaDict ivector_options: MetaDict - post_path: str - dubm_model: str + post_path: Path + dubm_model: Path class AccIvectorStatsArguments(MfaArguments): @@ -59,9 +60,9 @@ class AccIvectorStatsArguments(MfaArguments): feature_options: MetaDict ivector_options: MetaDict - ie_path: str - post_path: str - acc_path: str + ie_path: Path + post_path: Path + acc_path: Path class GmmGselectFunction(KaldiFunction): diff --git a/montreal_forced_aligner/ivector/trainer.py b/montreal_forced_aligner/ivector/trainer.py index 3ca28d3f..baa14bcc 100644 --- a/montreal_forced_aligner/ivector/trainer.py +++ b/montreal_forced_aligner/ivector/trainer.py @@ -8,6 +8,7 @@ import subprocess import time import typing +from pathlib import Path from typing import Any, Dict, List, Optional, Tuple import tqdm @@ -78,7 +79,7 @@ def compute_calculated_properties(self) -> None: """Not implemented""" pass - def export_model(self, output_model_path: str) -> None: + def export_model(self, output_model_path: Path) -> None: """ Output IvectorExtractor model @@ -87,9 +88,11 @@ def export_model(self, output_model_path: str) -> None: output_model_path : str Path to save ivector extractor model """ - directory, filename = os.path.split(output_model_path) - basename, _ = os.path.splitext(filename) - ivector_extractor = IvectorExtractorModel.empty(basename, self.working_log_directory) + directory = output_model_path.parent + + ivector_extractor = IvectorExtractorModel.empty( + output_model_path.stem, self.working_log_directory + ) ivector_extractor.add_meta_file(self) ivector_extractor.add_model(self.working_directory) if directory: @@ -180,7 +183,7 @@ def gmm_gselect_arguments(self) -> List[GmmGselectArguments]: GmmGselectArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"gmm_gselect.{j.id}.log"), + self.working_log_directory.joinpath(f"gmm_gselect.{j.id}.log"), self.feature_options, self.dubm_options, self.model_path, @@ -247,7 +250,7 @@ def gmm_gselect(self) -> None: def _trainer_initialization(self, initial_alignment_directory: Optional[str] = None) -> None: """DUBM training initialization""" - log_path = 
os.path.join(self.working_log_directory, "gmm_init.log") + log_path = self.working_log_directory.joinpath("gmm_init.log") with self.session() as session, mfa_open(log_path, "w") as log_file: alignment_workflow: CorpusWorkflow = ( session.query(CorpusWorkflow) @@ -278,7 +281,7 @@ def _trainer_initialization(self, initial_alignment_directory: Optional[str] = N feature_string = job.construct_online_feature_proc_string() feature_string = feature_string.replace(f".{job.id}.scp", ".scp") feature_string = feature_string.replace( - job.corpus.current_subset_directory, job.corpus.data_directory + str(job.corpus.current_subset_directory), str(job.corpus.data_directory) ) gmm_init_proc = subprocess.Popen( [ @@ -332,7 +335,7 @@ def acc_global_stats(self) -> None: opt = "--remove-low-count-gaussians=false" else: opt = f"--remove-low-count-gaussians={self.remove_low_count_gaussians}" - log_path = os.path.join(self.working_log_directory, f"update.{self.iteration}.log") + log_path = self.working_log_directory.joinpath(f"update.{self.iteration}.log") with mfa_open(log_path, "w") as log_file: acc_files = [] for j in arguments: @@ -365,7 +368,7 @@ def acc_global_stats(self) -> None: @property def exported_model_path(self) -> str: """Temporary model path to save intermediate model""" - return os.path.join(self.working_log_directory, "dubm_model.zip") + return self.working_log_directory.joinpath("dubm_model.zip") def train_iteration(self) -> None: """ @@ -377,9 +380,9 @@ def train_iteration(self) -> None: def finalize_training(self) -> None: """Finalize DUBM training""" - final_dubm_path = os.path.join(self.working_directory, "final.dubm") + final_dubm_path = self.working_directory.joinpath("final.dubm") shutil.copy( - os.path.join(self.working_directory, f"{self.num_iterations+1}.dubm"), + self.working_directory.joinpath(f"{self.num_iterations+1}.dubm"), final_dubm_path, ) # Update VAD with dubm likelihoods @@ -393,15 +396,15 @@ def finalize_training(self) -> None: def model_path(self) -> str: """Current iteration's DUBM model path""" if self.training_complete: - return os.path.join(self.working_directory, "final.dubm") - return os.path.join(self.working_directory, f"{self.iteration}.dubm") + return self.working_directory.joinpath("final.dubm") + return self.working_directory.joinpath(f"{self.iteration}.dubm") @property def next_model_path(self) -> str: """Next iteration's DUBM model path""" if self.training_complete: - return os.path.join(self.working_directory, "final.dubm") - return os.path.join(self.working_directory, f"{self.iteration + 1}.dubm") + return self.working_directory.joinpath("final.dubm") + return self.working_directory.joinpath(f"{self.iteration + 1}.dubm") class IvectorTrainer(IvectorModelTrainingMixin, IvectorConfigMixin): @@ -436,7 +439,7 @@ def __init__( @property def exported_model_path(self) -> str: """Temporary directory path that trainer will save ivector extractor model""" - return os.path.join(self.working_log_directory, "ivector_model.zip") + return self.working_log_directory.joinpath("ivector_model.zip") def acc_ivector_stats_arguments(self) -> List[AccIvectorStatsArguments]: """ @@ -469,11 +472,11 @@ def _trainer_initialization(self) -> None: """Ivector extractor training initialization""" self.iteration = 1 # Initialize job_name-vector extractor - log_directory = os.path.join(self.working_directory, "log") + log_directory = self.working_directory.joinpath("log") log_path = os.path.join(log_directory, "init.log") - diag_ubm_path = os.path.join(self.working_directory, 
"final.dubm") + diag_ubm_path = self.working_directory.joinpath("final.dubm") - full_ubm_path = os.path.join(self.working_directory, "final.ubm") + full_ubm_path = self.working_directory.joinpath("final.ubm") if not os.path.exists(self.ie_path): with mfa_open(log_path, "w") as log_file: subprocess.check_call( @@ -510,7 +513,7 @@ def gauss_to_post_arguments(self) -> List[GaussToPostArguments]: GaussToPostArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"gauss_to_post.{j.id}.log"), + self.working_log_directory.joinpath(f"gauss_to_post.{j.id}.log"), self.feature_options, self.ivector_options, j.construct_path(self.working_directory, "post", "ark"), @@ -572,20 +575,20 @@ def meta(self) -> MetaDict: def ie_path(self) -> str: """Current ivector extractor model path""" if self.training_complete: - return os.path.join(self.working_directory, "final.ie") - return os.path.join(self.working_directory, f"{self.iteration}.ie") + return self.working_directory.joinpath("final.ie") + return self.working_directory.joinpath(f"{self.iteration}.ie") @property def next_ie_path(self) -> str: """Next iteration's ivector extractor model path""" if self.training_complete: - return os.path.join(self.working_directory, "final.ie") - return os.path.join(self.working_directory, f"{self.iteration + 1}.ie") + return self.working_directory.joinpath("final.ie") + return self.working_directory.joinpath(f"{self.iteration + 1}.ie") @property def dubm_path(self) -> str: """DUBM model path""" - return os.path.join(self.working_directory, "final.dubm") + return self.working_directory.joinpath("final.dubm") def acc_ivector_stats(self) -> None: """ @@ -615,8 +618,8 @@ def acc_ivector_stats(self) -> None: logger.debug(f"Accumulating stats took {time.time() - begin:.3f} seconds") - log_path = os.path.join(self.working_log_directory, f"sum_acc.{self.iteration}.log") - acc_path = os.path.join(self.working_directory, f"acc.{self.iteration}") + log_path = self.working_log_directory.joinpath(f"sum_acc.{self.iteration}.log") + acc_path = self.working_directory.joinpath(f"acc.{self.iteration}") with mfa_open(log_path, "w") as log_file: accinits = [] for j in arguments: @@ -635,7 +638,7 @@ def acc_ivector_stats(self) -> None: if os.path.exists(p): os.remove(p) # Est extractor - log_path = os.path.join(self.working_log_directory, f"update.{self.iteration}.log") + log_path = self.working_log_directory.joinpath(f"update.{self.iteration}.log") with mfa_open(log_path, "w") as log_file: extractor_est_proc = subprocess.Popen( [ @@ -643,7 +646,7 @@ def acc_ivector_stats(self) -> None: f"--num-threads={len(self.jobs)}", f"--gaussian-min-count={self.gaussian_min_count}", self.ie_path, - os.path.join(self.working_directory, f"acc.{self.iteration}"), + self.working_directory.joinpath(f"acc.{self.iteration}"), self.next_ie_path, ], stderr=subprocess.PIPE, @@ -686,8 +689,8 @@ def finalize_training(self) -> None: """ # Rename to final shutil.copy( - os.path.join(self.working_directory, f"{self.num_iterations}.ie"), - os.path.join(self.working_directory, "final.ie"), + self.working_directory.joinpath(f"{self.num_iterations}.ie"), + self.working_directory.joinpath("final.ie"), ) self.export_model(self.exported_model_path) wf = self.worker.current_workflow @@ -713,8 +716,8 @@ def _trainer_initialization(self) -> None: def compute_lda(self): - lda_path = os.path.join(self.working_directory, "ivector_lda.mat") - log_path = os.path.join(self.working_log_directory, "lda.log") + lda_path = 
self.working_directory.joinpath("ivector_lda.mat") + log_path = self.working_log_directory.joinpath("lda.log") utt2spk_path = os.path.join(self.corpus_output_directory, "utt2spk.scp") with tqdm.tqdm( total=self.worker.num_utterances, disable=GLOBAL_CONFIG.quiet @@ -757,12 +760,12 @@ def train(self): self.worker.compute_plda() self.worker.compute_speaker_ivectors() os.rename( - os.path.join(self.working_directory, "current_speaker_ivectors.ark"), - os.path.join(self.working_directory, "speaker_ivectors.ark"), + self.working_directory.joinpath("current_speaker_ivectors.ark"), + self.working_directory.joinpath("speaker_ivectors.ark"), ) os.rename( - os.path.join(self.working_directory, "current_num_utts.ark"), - os.path.join(self.working_directory, "num_utts.ark"), + self.working_directory.joinpath("current_num_utts.ark"), + self.working_directory.joinpath("num_utts.ark"), ) @@ -900,7 +903,7 @@ def train(self) -> None: previous = trainer logger.info(f"Completed training in {time.time()-begin} seconds!") - def export_model(self, output_model_path: str) -> None: + def export_model(self, output_model_path: Path) -> None: """ Export an ivector extractor model to the specified path @@ -916,7 +919,7 @@ def export_model(self, output_model_path: str) -> None: @classmethod def parse_parameters( cls, - config_path: Optional[str] = None, + config_path: Optional[Path] = None, args: Optional[Dict[str, Any]] = None, unknown_args: Optional[typing.Iterable[str]] = None, ) -> MetaDict: @@ -925,7 +928,7 @@ def parse_parameters( Parameters ---------- - config_path: str, optional + config_path: :class:`~pathlib.Path`, optional Path to yaml configuration file args: dict[str, Any] Parsed arguments diff --git a/montreal_forced_aligner/language_modeling/multiprocessing.py b/montreal_forced_aligner/language_modeling/multiprocessing.py index 3b0caa73..d7cd2b22 100644 --- a/montreal_forced_aligner/language_modeling/multiprocessing.py +++ b/montreal_forced_aligner/language_modeling/multiprocessing.py @@ -4,6 +4,7 @@ import os import subprocess import typing +from pathlib import Path import sqlalchemy from sqlalchemy.orm import Session, joinedload, subqueryload @@ -38,27 +39,21 @@ class TrainSpeakerLmArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run - model_directory: str - Path to model directory - word_symbols_paths: dict[int, str] - Per dictionary words symbol table paths - speaker_mapping: dict[int, str] - Mapping of dictionaries to speakers - speaker_paths: dict[int, str] - Per speaker output LM paths - oov_word: str - OOV word + model_path: :class:`~pathlib.Path` + Path to model order: int Ngram order of the language models method: str Ngram smoothing method target_num_ngrams: int Target number of ngrams + hclg_options: dict[str, Any] + HCLG creation options """ - model_path: str + model_path: Path order: int method: str target_num_ngrams: int @@ -76,28 +71,18 @@ class TrainLmArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run - model_directory: str - Path to model directory - word_symbols_paths: dict[int, str] - Per dictionary words symbol table paths - speaker_mapping: dict[int, str] - Mapping of dictionaries to speakers - speaker_paths: dict[int, str] - Per speaker output LM paths + symbols_path: :class:`~pathlib.Path` + Words symbol 
table paths oov_word: str OOV word order: int Ngram order of the language models - method: str - Ngram smoothing method - target_num_ngrams: int - Target number of ngrams """ - working_directory: str - symbols_path: str + working_directory: Path + symbols_path: Path order: int oov_word: str @@ -158,7 +143,7 @@ def _run(self) -> typing.Generator[bool]: "--round_to_int", f"--order={self.order}", "-", - os.path.join(self.working_directory, f"{self.job_name}.cnts"), + self.working_directory.joinpath(f"{self.job_name}.cnts"), ], stderr=log_file, stdin=farcompile_proc.stdout, @@ -234,7 +219,7 @@ def _run(self) -> typing.Generator[bool]: "--round_to_int", f"--order={self.order}", "-", - os.path.join(self.working_directory, f"{self.job_name}.cnts"), + self.working_directory.joinpath(f"{self.job_name}.cnts"), ], stderr=log_file, stdin=farcompile_proc.stdout, @@ -300,7 +285,7 @@ def _run(self) -> typing.Generator[bool]: ) for (speaker_id,) in speakers: - hclg_path = os.path.join(d.temp_directory, f"{speaker_id}.fst") + hclg_path = d.temp_directory.joinpath(f"{speaker_id}.fst") if os.path.exists(hclg_path): continue utterances = ( @@ -308,7 +293,7 @@ def _run(self) -> typing.Generator[bool]: .filter(Utterance.speaker_id == speaker_id) .order_by(Utterance.kaldi_id) ) - mod_path = os.path.join(d.temp_directory, f"g.{speaker_id}.fst") + mod_path = d.temp_directory.joinpath(f"g.{speaker_id}.fst") farcompile_proc = subprocess.Popen( [ thirdparty_binary("farcompilestrings"), @@ -357,18 +342,18 @@ def _run(self) -> typing.Generator[bool]: shrink_proc.wait() context_width = self.hclg_options["context_width"] central_pos = self.hclg_options["central_pos"] - path_template = os.path.join( - d.temp_directory, f"{{file_name}}.{speaker_id}.fst" + lg_path = d.temp_directory.joinpath(f"LG.{speaker_id}.fst") + hclga_path = d.temp_directory.joinpath(f"HCLGa.{speaker_id}.fst") + ilabels_temp = d.temp_directory.joinpath( + f"ilabels_{context_width}_{central_pos}.{speaker_id}" + ) + out_disambig = d.temp_directory.joinpath( + f"disambig_ilabels_{context_width}_{central_pos}.int" ) - lg_path = path_template.format(file_name="LG") - hclga_path = path_template.format(file_name="HCLGa") - clg_path = path_template.format(file_name=f"CLG_{context_width}_{central_pos}") - ilabels_temp = path_template.format( - file_name=f"ilabels_{context_width}_{central_pos}" - ).replace(".fst", "") - out_disambig = path_template.format( - file_name=f"disambig_ilabels_{context_width}_{central_pos}" - ).replace(".fst", ".int") + clg_path = d.temp_directory.joinpath( + f"CLG_{context_width}_{central_pos}.{speaker_id}.fst" + ) + log_file.write("Generating LG.fst...") compose_lg(d.lexicon_disambig_fst_path, mod_path, lg_path, log_file) log_file.write("Generating CLG.fst...") diff --git a/montreal_forced_aligner/language_modeling/trainer.py b/montreal_forced_aligner/language_modeling/trainer.py index b3d8c97f..690b610e 100644 --- a/montreal_forced_aligner/language_modeling/trainer.py +++ b/montreal_forced_aligner/language_modeling/trainer.py @@ -7,6 +7,7 @@ import re import subprocess import typing +from pathlib import Path from queue import Empty import sqlalchemy @@ -82,29 +83,29 @@ def __init__( self.prune_thresh_medium = prune_thresh_medium @property - def mod_path(self) -> str: + def mod_path(self) -> Path: """Internal temporary path to the model file""" - return os.path.join(self.working_directory, f"{self.data_source_identifier}.mod") + return self.working_directory.joinpath(f"{self.data_source_identifier}.mod") @property - def 
far_path(self) -> str: + def far_path(self) -> Path: """Internal temporary path to the FAR file""" - return os.path.join(self.working_directory, f"{self.data_source_identifier}.far") + return self.working_directory.joinpath(f"{self.data_source_identifier}.far") @property - def large_arpa_path(self) -> str: + def large_arpa_path(self) -> Path: """Internal temporary path to the large arpa file""" - return os.path.join(self.working_directory, f"{self.data_source_identifier}.arpa") + return self.working_directory.joinpath(f"{self.data_source_identifier}.arpa") @property - def medium_arpa_path(self) -> str: + def medium_arpa_path(self) -> Path: """Internal temporary path to the medium arpa file""" - return self.large_arpa_path.replace(".arpa", "_medium.arpa") + return self.working_directory.joinpath(f"{self.data_source_identifier}_medium.arpa") @property - def small_arpa_path(self) -> str: + def small_arpa_path(self) -> Path: """Internal temporary path to the small arpa file""" - return self.large_arpa_path.replace(".arpa", "_small.arpa") + return self.working_directory.joinpath(f"{self.data_source_identifier}_small.arpa") def initialize_training(self) -> None: """Initialize training""" @@ -121,8 +122,8 @@ def finalize_training(self) -> None: def prune_large_language_model(self) -> None: """Prune the large language model into small and medium versions""" logger.info("Pruning large ngram model to medium and small versions...") - small_mod_path = self.mod_path.replace(".mod", "_small.mod") - med_mod_path = self.mod_path.replace(".mod", "_med.mod") + small_mod_path = self.mod_path.with_stem(self.mod_path.stem + "_small") + med_mod_path = self.mod_path.with_stem(self.mod_path.stem + "_med") subprocess.check_call( [ "ngramshrink", @@ -132,7 +133,7 @@ def prune_large_language_model(self) -> None: med_mod_path, ] ) - assert os.path.exists(med_mod_path) + assert med_mod_path.exists() if getattr(self, "sym_path", None): subprocess.check_call( [ @@ -145,7 +146,7 @@ def prune_large_language_model(self) -> None: ) else: subprocess.check_call(["ngramprint", "--ARPA", med_mod_path, self.medium_arpa_path]) - assert os.path.exists(self.medium_arpa_path) + assert self.medium_arpa_path.exists() logger.debug("Finished pruning medium arpa!") subprocess.check_call( @@ -157,7 +158,7 @@ def prune_large_language_model(self) -> None: small_mod_path, ] ) - assert os.path.exists(small_mod_path) + assert small_mod_path.exists() if getattr(self, "sym_path", None): subprocess.check_call( [ @@ -170,31 +171,31 @@ def prune_large_language_model(self) -> None: ) else: subprocess.check_call(["ngramprint", "--ARPA", small_mod_path, self.small_arpa_path]) - assert os.path.exists(self.small_arpa_path) + assert self.small_arpa_path.exists() logger.debug("Finished pruning small arpa!") logger.info("Done pruning!") - def export_model(self, output_model_path: str) -> None: + def export_model(self, output_model_path: Path) -> None: """ Export language model to specified path Parameters ---------- - output_model_path:str + output_model_path: :class:`~pathlib.Path` Path to export model """ - directory, filename = os.path.split(output_model_path) - basename, _ = os.path.splitext(filename) - model_temp_dir = os.path.join(self.working_directory, "model_archiving") + directory = output_model_path.parent + directory.mkdir(parents=True, exist_ok=True) + + model_temp_dir = self.working_directory.joinpath("model_archiving") os.makedirs(model_temp_dir, exist_ok=True) - model = LanguageModel.empty(basename, root_directory=model_temp_dir) + model = 
LanguageModel.empty(output_model_path.stem, root_directory=model_temp_dir) model.add_meta_file(self) model.add_arpa_file(self.large_arpa_path) model.add_arpa_file(self.medium_arpa_path) model.add_arpa_file(self.small_arpa_path) - basename, _ = os.path.splitext(output_model_path) - model.dump(basename) + model.dump(output_model_path) class LmCorpusTrainerMixin(LmTrainerMixin, TextCorpusMixin): @@ -229,22 +230,22 @@ def __init__(self, **kwargs): @property def sym_path(self) -> str: """Internal path to symbols file""" - return os.path.join(self.working_directory, "lm.sym") + return self.working_directory.joinpath("lm.sym") @property def far_path(self) -> str: """Internal path to FAR file""" - return os.path.join(self.working_directory, "lm.far") + return self.working_directory.joinpath("lm.far") @property def cnts_path(self) -> str: """Internal path to counts file""" - return os.path.join(self.working_directory, "lm.cnts") + return self.working_directory.joinpath("lm.cnts") @property def training_path(self) -> str: """Internal path to training data""" - return os.path.join(self.working_directory, "training.txt") + return self.working_directory.joinpath("training.txt") @property def meta(self) -> MetaDict: @@ -287,10 +288,10 @@ def evaluate(self) -> None: """ Run an evaluation over the training data to generate perplexity score """ - log_path = os.path.join(self.working_log_directory, "evaluate.log") + log_path = self.working_log_directory.joinpath("evaluate.log") - small_mod_path = self.mod_path.replace(".mod", "_small.mod") - med_mod_path = self.mod_path.replace(".mod", "_med.mod") + small_mod_path = self.mod_path.with_stem(self.mod_path.stem + "_small") + med_mod_path = self.mod_path.with_stem(self.mod_path.stem + "_med") with self.session() as session, mfa_open(log_path, "w") as log_file: word_query = session.query(Word.word).filter(Word.word_type == WordType.speech) included_words = set(x[0] for x in word_query) @@ -401,7 +402,7 @@ def evaluate(self) -> None: def train_large_lm(self) -> None: """Train a large language model""" logger.info("Beginning training large ngram model...") - log_path = os.path.join(self.working_log_directory, "lm_training.log") + log_path = self.working_log_directory.joinpath("lm_training.log") return_queue = mp.Queue() stopped = Stopped() error_dict = {} @@ -412,7 +413,7 @@ def train_large_lm(self) -> None: args = TrainLmArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"ngram_count.{j.id}.log"), + self.working_log_directory.joinpath(f"ngram_count.{j.id}.log"), self.working_directory, self.sym_path, self.order, @@ -422,7 +423,7 @@ def train_large_lm(self) -> None: p = KaldiProcessWorker(j.id, return_queue, function, stopped) procs.append(p) p.start() - count_paths.append(os.path.join(self.working_directory, f"{j.id}.cnts")) + count_paths.append(self.working_directory.joinpath(f"{j.id}.cnts")) with tqdm.tqdm(total=self.num_utterances, disable=GLOBAL_CONFIG.quiet) as pbar: while True: try: @@ -442,7 +443,7 @@ def train_large_lm(self) -> None: pbar.update(1) logger.info("Training model...") with mfa_open(log_path, "w") as log_file: - merged_file = os.path.join(self.working_directory, "merged.cnts") + merged_file = self.working_directory.joinpath("merged.cnts") if len(count_paths) > 1: ngrammerge_proc = subprocess.Popen( [ @@ -525,14 +526,14 @@ class MfaLmArpaTrainer(LmTrainerMixin, TopLevelMfaWorker, DatabaseMixin): For top-level parsing parameters """ - def __init__(self, arpa_path: str, keep_case: bool = False, **kwargs): 
+ def __init__(self, arpa_path: Path, keep_case: bool = False, **kwargs): self.arpa_path = arpa_path self.keep_case = keep_case super().__init__(**kwargs) @property - def working_directory(self) -> str: - return os.path.join(self.output_directory, self.data_source_identifier) + def working_directory(self) -> Path: + return self.output_directory.joinpath(self.data_source_identifier) def setup(self) -> None: """Set up language model training""" @@ -566,7 +567,7 @@ def train(self) -> None: """Convert the arpa model to MFA format""" logger.info("Parsing large ngram model...") - with mfa_open(os.path.join(self.working_log_directory, "read.log"), "w") as log_file: + with mfa_open(self.working_log_directory.joinpath("read.log"), "w") as log_file: subprocess.check_call( ["ngramread", "--ARPA", self.large_arpa_path, self.mod_path], stderr=log_file ) diff --git a/montreal_forced_aligner/models.py b/montreal_forced_aligner/models.py index 7adf1f95..59f3a09f 100644 --- a/montreal_forced_aligner/models.py +++ b/montreal_forced_aligner/models.py @@ -34,6 +34,7 @@ from montreal_forced_aligner.abc import MetaDict from montreal_forced_aligner.dictionary.mixins import DictionaryMixin from montreal_forced_aligner.g2p.trainer import G2PTrainer + from montreal_forced_aligner.tokenization.trainer import TokenizerTrainer else: from dataclassy import dataclass @@ -56,13 +57,13 @@ ] -def guess_model_type(path: str) -> List[str]: +def guess_model_type(path: Path) -> List[str]: """ Guess a model type given a path Parameters ---------- - path: str + path: :class:`~pathlib.Path` Model archive to guess Returns @@ -90,9 +91,9 @@ class Archive(MfaModel): Parameters ---------- - source: Path + source: :class:`~pathlib.Path` Source path - root_directory: Path + root_directory: :class:`~pathlib.Path` Root directory to unpack and store temporary files """ @@ -155,16 +156,16 @@ def parse_old_features(self) -> None: del self._meta["features"][key] if "uses_splices" not in self._meta["features"]: # Backwards compatibility self._meta["features"]["uses_splices"] = os.path.exists( - os.path.join(self.dirname, "lda.mat") + self.dirname.joinpath("lda.mat") ) if "uses_speaker_adaptation" not in self._meta["features"]: self._meta["features"]["uses_speaker_adaptation"] = os.path.exists( - os.path.join(self.dirname, "final.alimdl") + self.dirname.joinpath("final.alimdl") ) def get_subclass_object( self, - ) -> Union[AcousticModel, G2PModel, LanguageModel, IvectorExtractorModel]: + ) -> Union[AcousticModel, G2PModel, LanguageModel, TokenizerModel, IvectorExtractorModel]: """ Instantiate subclass models based on files contained in the archive @@ -178,15 +179,18 @@ def get_subclass_object( :class:`~montreal_forced_aligner.exceptions.ModelLoadError` If the model type cannot be determined """ - for f in os.listdir(self.dirname): - if f == "tree": - return AcousticModel(self.dirname, self.root_directory) - if f in {"phones.sym", "phones.txt"}: - return G2PModel(self.dirname, self.root_directory) - if f.endswith(".arpa"): - return LanguageModel(self.dirname, self.root_directory) - if f == "final.ie": - return IvectorExtractorModel(self.dirname, self.root_directory) + files = [x.name for x in self.dirname.iterdir()] + + if "tree" in files: + return AcousticModel(self.dirname, self.root_directory) + if "phones.sym" in files or "phones.txt" in files: + return G2PModel(self.dirname, self.root_directory) + if any(f.endswith(".arpa") for f in files): + return LanguageModel(self.dirname, self.root_directory) + if "final.ie" in files: + return 
IvectorExtractorModel(self.dirname, self.root_directory) + if "tokenizer.fst" in files: + return TokenizerModel(self.dirname, self.root_directory) raise ModelLoadError(self.source) @classmethod @@ -196,7 +200,7 @@ def valid_extension(cls, filename: Path) -> bool: Parameters ---------- - filename: Path + filename: :class:`~pathlib.Path` File name to check Returns @@ -217,7 +221,7 @@ def generate_path( Parameters ---------- - root: Path + root: :class:`~pathlib.Path` Root directory for the full path name: str Name of the model @@ -230,7 +234,6 @@ def generate_path( Full path in the root directory for the model """ for ext in cls.extensions: - path = os.path.join(root, name + ext) path = root.joinpath(name + ext) if path.exists() or not enforce_existence: return path @@ -250,14 +253,14 @@ def meta(self) -> dict: Get the meta data associated with the model """ if not self._meta: - meta_path = os.path.join(self.dirname, "meta.json") + meta_path = self.dirname.joinpath("meta.json") format = "json" if not os.path.exists(meta_path): - meta_path = os.path.join(self.dirname, "meta.yaml") + meta_path = self.dirname.joinpath("meta.yaml") format = "yaml" with mfa_open(meta_path, "r") as f: if format == "yaml": - self._meta = yaml.safe_load(f) + self._meta = yaml.load(f, Loader=yaml.Loader) else: self._meta = json.load(f) self.parse_old_features() @@ -272,13 +275,15 @@ def add_meta_file(self, trainer: ModelExporterMixin) -> None: trainer: :class:`~montreal_forced_aligner.abc.ModelExporterMixin` The trainer to construct the metadata from """ - with mfa_open(os.path.join(self.dirname, "meta.json"), "w") as f: + with mfa_open(self.dirname.joinpath("meta.json"), "w") as f: json.dump(trainer.meta, f, ensure_ascii=False) @classmethod def empty( - cls, head: str, root_directory: Optional[str] = None - ) -> Union[Archive, IvectorExtractorModel, AcousticModel, G2PModel, LanguageModel]: + cls, head: str, root_directory: Optional[typing.Union[str, Path]] = None + ) -> Union[ + Archive, IvectorExtractorModel, AcousticModel, G2PModel, TokenizerModel, LanguageModel + ]: """ Initialize an archive using an empty directory @@ -291,18 +296,17 @@ def empty( Returns ------- - :class:`~montreal_forced_aligner.models.Archive`, :class:`~montreal_forced_aligner.models.AcousticModel`, :class:`~montreal_forced_aligner.models.G2PModel`, :class:`~montreal_forced_aligner.models.LanguageModel`, or :class:`~montreal_forced_aligner.models.IvectorExtractorModel` + :class:`~montreal_forced_aligner.models.Archive`, :class:`~montreal_forced_aligner.models.AcousticModel`, :class:`~montreal_forced_aligner.models.G2PModel`, :class:`~montreal_forced_aligner.models.LanguageModel`, :class:`~montreal_forced_aligner.models.TokenizerModel`, or :class:`~montreal_forced_aligner.models.IvectorExtractorModel` Model constructed from the empty directory """ from .config import get_temporary_directory if root_directory is None: - root_directory = os.path.join(get_temporary_directory(), "temp_models", cls.model_type) + root_directory = get_temporary_directory().joinpath("temp_models", cls.model_type) - os.makedirs(root_directory, exist_ok=True) - source = os.path.join(root_directory, head) - os.makedirs(source, exist_ok=True) - return cls(source) + source = root_directory.joinpath(head) + source.mkdir(parents=True, exist_ok=True) + return cls(source, root_directory) def add(self, source: str): """ @@ -323,13 +327,13 @@ def clean_up(self) -> None: """Remove temporary directory""" rmtree(self.dirname) - def dump(self, path: str, archive_fmt: str = FORMAT) 
-> str: + def dump(self, path: Path, archive_fmt: str = FORMAT) -> str: """ Write archive to disk, and return the name of final archive Parameters ---------- - path: str + path: :class:`~pathlib.Path` Path to write to archive_fmt: str, optional Archive extension to use, defaults to ".zip" @@ -365,7 +369,11 @@ class AcousticModel(Archive): model_type = "acoustic" - def __init__(self, source: str, root_directory: Optional[str] = None): + def __init__( + self, + source: typing.Union[str, Path], + root_directory: Optional[typing.Union[str, Path]] = None, + ): if source in AcousticModel.get_available_models(): source = AcousticModel.get_pretrained_path(source) @@ -380,7 +388,7 @@ def add_meta_file(self, trainer: ModelExporterMixin) -> None: trainer: :class:`~montreal_forced_aligner.abc.ModelExporterMixin` Trainer to supply metadata information about the acoustic model """ - with mfa_open(os.path.join(self.dirname, "meta.json"), "w") as f: + with mfa_open(self.dirname.joinpath("meta.json"), "w") as f: json.dump(trainer.meta, f, ensure_ascii=False) @property @@ -399,7 +407,7 @@ def parameters(self) -> MetaDict: params["final_silence_correction"] = self.meta.get("final_silence_correction", None) if "other_noise_phone" in self.meta: params["other_noise_phone"] = self.meta["other_noise_phone"] - # rules_path = os.path.join(self.dirname, "rules.yaml") + # rules_path = self.dirname.joinpath("rules.yaml") # if os.path.exists(rules_path): # params["rules_path"] = rules_path if ( @@ -441,10 +449,10 @@ def meta(self) -> MetaDict: "splice_right_context": 3, } if not self._meta: - meta_path = os.path.join(self.dirname, "meta.json") + meta_path = self.dirname.joinpath("meta.json") format = "json" if not os.path.exists(meta_path): - meta_path = os.path.join(self.dirname, "meta.yaml") + meta_path = self.dirname.joinpath("meta.yaml") format = "yaml" if not os.path.exists(meta_path): self._meta = { @@ -455,7 +463,7 @@ def meta(self) -> MetaDict: else: with mfa_open(meta_path, "r") as f: if format == "yaml": - self._meta = yaml.safe_load(f) + self._meta = yaml.load(f, Loader=yaml.Loader) else: self._meta = json.load(f) if self._meta["features"] == "mfcc+deltas": @@ -483,7 +491,7 @@ def meta(self) -> MetaDict: or not self._meta["features"]["uses_speaker_adaptation"] ): self._meta["features"]["uses_speaker_adaptation"] = os.path.exists( - os.path.join(self.dirname, "final.alimdl") + self.dirname.joinpath("final.alimdl") ) if self._meta["version"] in {"0.9.0", "1.0.0"}: self._meta["features"]["uses_speaker_adaptation"] = True @@ -492,7 +500,7 @@ def meta(self) -> MetaDict: or not self._meta["features"]["uses_splices"] ): self._meta["features"]["uses_splices"] = os.path.exists( - os.path.join(self.dirname, "lda.mat") + self.dirname.joinpath("lda.mat") ) if self._meta["features"]["uses_splices"]: self._meta["features"]["uses_deltas"] = False @@ -561,10 +569,10 @@ def add_model(self, source: str) -> None: """ for f in self.files: if os.path.exists(os.path.join(source, f)): - copyfile(os.path.join(source, f), os.path.join(self.dirname, f)) + copyfile(os.path.join(source, f), self.dirname.joinpath(f)) def add_pronunciation_models( - self, source: str, dictionary_base_names: Collection[str] + self, source: Path, dictionary_base_names: Collection[str] ) -> None: """ Add file into archive @@ -578,10 +586,10 @@ def add_pronunciation_models( """ for base_name in dictionary_base_names: for f in [f"{base_name}.fst", f"{base_name}_align.fst"]: - if os.path.exists(os.path.join(source, f)): - copyfile(os.path.join(source, f), 
os.path.join(self.dirname, f)) + if source.joinpath(f).exists(): + copyfile(source.joinpath(f), self.dirname.joinpath(f)) - def export_model(self, destination: str) -> None: + def export_model(self, destination: Path) -> None: """ Extract the model files to a new directory @@ -590,10 +598,10 @@ def export_model(self, destination: str) -> None: destination: str Destination directory to extract files to """ - os.makedirs(destination, exist_ok=True) + destination.mkdir(parents=True, exist_ok=True) for f in self.files: - if os.path.exists(os.path.join(self.dirname, f)): - copyfile(os.path.join(self.dirname, f), os.path.join(destination, f)) + if os.path.exists(self.dirname.joinpath(f)): + copyfile(self.dirname.joinpath(f), destination.joinpath(f)) def log_details(self) -> None: """ @@ -603,10 +611,10 @@ def log_details(self) -> None: logger.debug("====ACOUSTIC MODEL INFO====") logger.debug("Acoustic model root directory: " + str(self.root_directory)) logger.debug("Acoustic model dirname: " + str(self.dirname)) - meta_path = os.path.join(self.dirname, "meta.json") + meta_path = self.dirname.joinpath("meta.json") if not os.path.exists(meta_path): - meta_path = os.path.join(self.dirname, "meta.yaml") - logger.debug("Acoustic model meta path: " + meta_path) + meta_path = self.dirname.joinpath("meta.yaml") + logger.debug("Acoustic model meta path: " + str(meta_path)) if not os.path.exists(meta_path): logger.debug("META.YAML DOES NOT EXIST, this may cause issues in validating the model") logger.debug("Acoustic model meta information:") @@ -659,7 +667,11 @@ class IvectorExtractorModel(Archive): ".zip", ] - def __init__(self, source: str, root_directory: Optional[str] = None): + def __init__( + self, + source: typing.Union[str, Path], + root_directory: Optional[typing.Union[str, Path]] = None, + ): if source in IvectorExtractorModel.get_available_models(): source = IvectorExtractorModel.get_pretrained_path(source) @@ -684,7 +696,7 @@ def add_model(self, source: str) -> None: """ for filename in self.model_files: if os.path.exists(os.path.join(source, filename)): - copyfile(os.path.join(source, filename), os.path.join(self.dirname, filename)) + copyfile(os.path.join(source, filename), self.dirname.joinpath(filename)) def export_model(self, destination: str) -> None: """ @@ -697,8 +709,8 @@ def export_model(self, destination: str) -> None: """ os.makedirs(destination, exist_ok=True) for filename in self.model_files: - if os.path.exists(os.path.join(self.dirname, filename)): - copyfile(os.path.join(self.dirname, filename), os.path.join(destination, filename)) + if os.path.exists(self.dirname.joinpath(filename)): + copyfile(self.dirname.joinpath(filename), os.path.join(destination, filename)) class G2PModel(Archive): @@ -717,7 +729,11 @@ class G2PModel(Archive): model_type = "g2p" - def __init__(self, source: str, root_directory: Optional[str] = None): + def __init__( + self, + source: typing.Union[str, Path], + root_directory: Optional[typing.Union[str, Path]] = None, + ): if source in G2PModel.get_available_models(): source = G2PModel.get_pretrained_path(source) @@ -733,17 +749,17 @@ def add_meta_file(self, g2p_trainer: G2PTrainer) -> None: Trainer for the G2P model """ - with mfa_open(os.path.join(self.dirname, "meta.json"), "w") as f: + with mfa_open(self.dirname.joinpath("meta.json"), "w") as f: json.dump(g2p_trainer.meta, f, cls=EnhancedJSONEncoder) @property def meta(self) -> dict: """Metadata for the G2P model""" if not self._meta: - meta_path = os.path.join(self.dirname, "meta.json") + meta_path 
= self.dirname.joinpath("meta.json") format = "json" if not os.path.exists(meta_path): - meta_path = os.path.join(self.dirname, "meta.yaml") + meta_path = self.dirname.joinpath("meta.yaml") format = "yaml" if not os.path.exists(meta_path): self._meta = {"version": "0.9.0", "architecture": "phonetisaurus"} @@ -752,7 +768,7 @@ def meta(self) -> dict: if format == "json": self._meta = json.load(f) else: - self._meta = yaml.safe_load(f) + self._meta = yaml.load(f, Loader=yaml.Loader) self._meta["phones"] = set(self._meta.get("phones", [])) self._meta["graphemes"] = set(self._meta.get("graphemes", [])) self._meta["evaluation"] = self._meta.get("evaluation", []) @@ -760,27 +776,27 @@ def meta(self) -> dict: return self._meta @property - def fst_path(self) -> str: + def fst_path(self) -> Path: """G2P model's FST path""" - return os.path.join(self.dirname, "model.fst") + return self.dirname.joinpath("model.fst") @property - def sym_path(self) -> str: + def sym_path(self) -> Path: """G2P model's symbols path""" - path = os.path.join(self.dirname, "phones.txt") - if os.path.exists(path): + path = self.dirname.joinpath("phones.txt") + if path.exists(): return path - return os.path.join(self.dirname, "phones.sym") + return self.dirname.joinpath("phones.sym") @property - def grapheme_sym_path(self) -> str: + def grapheme_sym_path(self) -> Path: """G2P model's grapheme symbols path""" - path = os.path.join(self.dirname, "graphemes.txt") - if os.path.exists(path): + path = self.dirname.joinpath("graphemes.txt") + if path.exists(): return path - return os.path.join(self.dirname, "graphemes.sym") + return self.dirname.joinpath("graphemes.sym") - def add_sym_path(self, source_directory: str) -> None: + def add_sym_path(self, source_directory: Path) -> None: """ Add symbols file into archive @@ -796,7 +812,7 @@ def add_sym_path(self, source_directory: str) -> None: ): copyfile(os.path.join(source_directory, "graphemes.txt"), self.grapheme_sym_path) - def add_fst_model(self, source_directory: str) -> None: + def add_fst_model(self, source_directory: Path) -> None: """ Add FST file into archive @@ -805,7 +821,7 @@ def add_fst_model(self, source_directory: str) -> None: source_directory: str Source directory path """ - if not os.path.exists(self.fst_path): + if not self.fst_path.exists(): copyfile(os.path.join(source_directory, "model.fst"), self.fst_path) def export_fst_model(self, destination: str) -> None: @@ -844,6 +860,122 @@ def validate(self, word_list: Collection[str]) -> bool: return True +class TokenizerModel(Archive): + """ + Class for Tokenizer models + + Parameters + ---------- + source: str + Path to source archive + root_directory: str + Path to save exported model + """ + + extensions = [".zip", ".tkn"] + + model_type = "tokenizer" + + def __init__( + self, + source: typing.Union[str, Path], + root_directory: Optional[typing.Union[str, Path]] = None, + ): + if source in TokenizerModel.get_available_models(): + source = TokenizerModel.get_pretrained_path(source) + + super().__init__(source, root_directory) + + def add_meta_file(self, g2p_trainer: TokenizerTrainer) -> None: + """ + Construct metadata information for the G2P model from the dictionary it was trained from + + Parameters + ---------- + g2p_trainer: :class:`~montreal_forced_aligner.g2p.trainer.G2PTrainer` + Trainer for the G2P model + """ + + with mfa_open(self.dirname.joinpath("meta.json"), "w") as f: + json.dump(g2p_trainer.meta, f, cls=EnhancedJSONEncoder) + + @property + def meta(self) -> dict: + """Metadata for the G2P model""" + 
+            meta_path = self.dirname.joinpath("meta.json")
+            format = "json"
+            if not os.path.exists(meta_path):
+                meta_path = self.dirname.joinpath("meta.yaml")
+                format = "yaml"
+            if not os.path.exists(meta_path):
+                self._meta = {"version": "0.9.0", "architecture": "pynini"}
+            else:
+                with mfa_open(meta_path, "r") as f:
+                    if format == "json":
+                        self._meta = json.load(f)
+                    else:
+                        self._meta = yaml.load(f, Loader=yaml.Loader)
+        self._meta["evaluation"] = self._meta.get("evaluation", [])
+        self._meta["training"] = self._meta.get("training", [])
+        return self._meta
+
+    @property
+    def fst_path(self) -> Path:
+        """Tokenizer model's FST path"""
+        return self.dirname.joinpath("tokenizer.fst")
+
+    @property
+    def sym_path(self) -> Path:
+        """Tokenizer model's grapheme symbols path"""
+        path = self.dirname.joinpath("graphemes.txt")
+        if path.exists():
+            return path
+        return self.dirname.joinpath("graphemes.sym")
+
+    def add_graphemes_path(self, source_directory: Path) -> None:
+        """
+        Add symbols file into archive
+
+        Parameters
+        ----------
+        source_directory: :class:`~pathlib.Path`
+            Source directory path
+        """
+        if not self.sym_path.exists():
+            copyfile(source_directory.joinpath("graphemes.txt"), self.sym_path)
+
+    def add_tokenizer_model(self, source_directory: Path) -> None:
+        """
+        Add FST file into archive
+
+        Parameters
+        ----------
+        source_directory: :class:`~pathlib.Path`
+            Source directory path
+        """
+        if not self.fst_path.exists():
+            copyfile(source_directory.joinpath("tokenizer.fst"), self.fst_path)
+
+    def export_fst_model(self, destination: Path) -> None:
+        """
+        Extract FST model path to destination
+
+        Parameters
+        ----------
+        destination: :class:`~pathlib.Path`
+            Destination directory
+        """
+        destination.mkdir(parents=True, exist_ok=True)
+        copy(self.fst_path, destination)
+
+    def validate(self, *args) -> None:
+        """
+        Placeholder
+        """
+        pass
+
+
 class LanguageModel(Archive):
     """
     Class for MFA language models
@@ -861,7 +993,11 @@ class LanguageModel(Archive):
     arpa_extension = ".arpa"
     extensions = [f".{FORMAT}", arpa_extension, ".lm"]

-    def __init__(self, source: typing.Union[str, Path], root_directory: Optional[str] = None):
+    def __init__(
+        self,
+        source: typing.Union[str, Path],
+        root_directory: Optional[typing.Union[str, Path]] = None,
+    ):
         if source in LanguageModel.get_available_models():
             source = LanguageModel.get_pretrained_path(source)
         from .config import get_temporary_directory
@@ -869,15 +1005,17 @@ def __init__(self, source: typing.Union[str, Path], root_directory: Optional[str
         if isinstance(source, str):
             source = Path(source)
         if root_directory is None:
-            root_directory = os.path.join(
-                get_temporary_directory(), "extracted_models", self.model_type
+            root_directory = get_temporary_directory().joinpath(
+                "extracted_models", self.model_type
             )
+        if isinstance(root_directory, str):
+            root_directory = Path(root_directory)
         if source.suffix == self.arpa_extension:
             self.root_directory = root_directory
             self._meta = {}
             self.name = source.stem
-            self.dirname = os.path.join(root_directory, f"{self.name}_{self.model_type}")
+            self.dirname = root_directory.joinpath(f"{self.name}_{self.model_type}")
             if not os.path.exists(self.dirname):
                 os.makedirs(self.dirname, exist_ok=True)
             copy(source, self.large_arpa_path)
@@ -907,40 +1045,44 @@ def carpa_path(self) -> str:
     @property
     def small_arpa_path(self) -> str:
         """Small arpa path"""
-        for file in os.listdir(self.dirname):
-            if file.endswith("_small" + self.arpa_extension):
-                return os.path.join(self.dirname, file)
-        return
os.path.join(self.dirname, f"{self.name}_small{self.arpa_extension}") + for path in self.dirname.iterdir(): + if path.name.endswith("_small" + self.arpa_extension): + return path + return self.dirname.joinpath(f"{self.name}_small{self.arpa_extension}") @property def medium_arpa_path(self) -> str: """Medium arpa path""" - for file in os.listdir(self.dirname): - if file.endswith("_med" + self.arpa_extension): - return os.path.join(self.dirname, file) - return os.path.join(self.dirname, f"{self.name}_med{self.arpa_extension}") + for path in self.dirname.iterdir(): + if path.name.endswith("_med" + self.arpa_extension): + return path + return self.dirname.joinpath(f"{self.name}_med{self.arpa_extension}") @property def large_arpa_path(self) -> str: """Large arpa path""" - for file in os.listdir(self.dirname): - if file.endswith(self.arpa_extension) and "_small" not in file and "_med" not in file: - return os.path.join(self.dirname, file) - return os.path.join(self.dirname, self.name + self.arpa_extension) + for path in self.dirname.iterdir(): + if ( + path.name.endswith(self.arpa_extension) + and "_small" not in path.name + and "_med" not in path.name + ): + return path + return self.dirname.joinpath(self.name + self.arpa_extension) - def add_arpa_file(self, arpa_path: str) -> None: + def add_arpa_file(self, arpa_path: Path) -> None: """ Adds an ARPA file to the model Parameters ---------- - arpa_path: str + arpa_path: :class:`~pathlib.Path` Path to ARPA file """ output_name = self.large_arpa_path - if arpa_path.endswith("_small.arpa"): + if arpa_path.name.endswith("_small.arpa"): output_name = self.small_arpa_path - elif arpa_path.endswith("_medium.arpa"): + elif arpa_path.name.endswith("_medium.arpa"): output_name = self.medium_arpa_path copyfile(arpa_path, output_name) @@ -951,9 +1093,9 @@ class DictionaryModel(MfaModel): Parameters ---------- - path: Path + path: :class:`~pathlib.Path` Path to the dictionary file - root_directory: Path, optional + root_directory: :class:`~pathlib.Path`, optional Path to working directory (currently not needed, but present to maintain consistency with other MFA Models """ @@ -1112,7 +1254,7 @@ def valid_extension(cls, filename: Path) -> bool: Parameters ---------- - filename: Path + filename: :class:`~pathlib.Path` File name to check Returns @@ -1133,7 +1275,7 @@ def generate_path( Parameters ---------- - root: Path + root: :class:`~pathlib.Path` Root directory for the full path name: str Name of the model @@ -1173,7 +1315,7 @@ def load_dictionary_paths(self) -> Dict[str, Tuple[DictionaryModel, typing.Set[s mapping = {} if self.is_multiple: with mfa_open(self.path, "r") as f: - data = yaml.safe_load(f) + data = yaml.load(f, Loader=yaml.Loader) for speaker, path in data.items(): if path not in mapping: mapping[path] = (DictionaryModel(path), set()) @@ -1189,6 +1331,7 @@ def load_dictionary_paths(self) -> Dict[str, Tuple[DictionaryModel, typing.Set[s "dictionary": DictionaryModel, "language_model": LanguageModel, "ivector": IvectorExtractorModel, + "tokenizer": TokenizerModel, } diff --git a/montreal_forced_aligner/online/alignment.py b/montreal_forced_aligner/online/alignment.py index 047a96c5..0763809f 100644 --- a/montreal_forced_aligner/online/alignment.py +++ b/montreal_forced_aligner/online/alignment.py @@ -4,6 +4,7 @@ import os import subprocess import typing +from pathlib import Path from sqlalchemy.orm import Session @@ -32,7 +33,7 @@ class OnlineAlignmentArguments(MfaArguments): Arguments for performing alignment online on single utterances """ - 
working_directory: str + working_directory: Path sox_string: str utterance_data: UtteranceData mfcc_options: MetaDict @@ -40,8 +41,8 @@ class OnlineAlignmentArguments(MfaArguments): feature_options: MetaDict lda_options: MetaDict align_options: MetaDict - model_path: str - tree_path: str + model_path: Path + tree_path: Path dictionary_id: int @@ -186,18 +187,18 @@ def _run(self) -> typing.Tuple[typing.List[CtmInterval], typing.List[CtmInterval ) for w_id, pron, p_id in pronunciations: self.pronunciation_mapping[(w_id, pron)] = p_id - wav_path = os.path.join(self.working_directory, "wav.scp") - likelihood_path = os.path.join(self.working_directory, "likelihoods.scp") - feat_path = os.path.join(self.working_directory, "feats.scp") - utt2spk_path = os.path.join(self.working_directory, "utt2spk.scp") - segment_path = os.path.join(self.working_directory, "segments.scp") - text_int_path = os.path.join(self.working_directory, "text.int") - lda_mat_path = os.path.join(self.working_directory, "lda.mat") - fst_path = os.path.join(self.working_directory, "fsts.ark") - mfcc_ark_path = os.path.join(self.working_directory, "mfcc.ark") - pitch_ark_path = os.path.join(self.working_directory, "pitch.ark") - feats_ark_path = os.path.join(self.working_directory, "feats.ark") - ali_path = os.path.join(self.working_directory, "ali.ark") + wav_path = self.working_directory.joinpath("wav.scp") + likelihood_path = self.working_directory.joinpath("likelihoods.scp") + feat_path = self.working_directory.joinpath("feats.scp") + utt2spk_path = self.working_directory.joinpath("utt2spk.scp") + segment_path = self.working_directory.joinpath("segments.scp") + text_int_path = self.working_directory.joinpath("text.int") + lda_mat_path = self.working_directory.joinpath("lda.mat") + fst_path = self.working_directory.joinpath("fsts.ark") + mfcc_ark_path = self.working_directory.joinpath("mfcc.ark") + pitch_ark_path = self.working_directory.joinpath("pitch.ark") + feats_ark_path = self.working_directory.joinpath("feats.ark") + ali_path = self.working_directory.joinpath("ali.ark") min_length = 0.1 if self.align_options["boost_silence"] != 1.0: mdl_string = f"gmm-boost-silence --boost={self.align_options['boost_silence']} {self.align_options['optional_silence_csl']} {self.model_path} - |" diff --git a/montreal_forced_aligner/textgrid.py b/montreal_forced_aligner/textgrid.py index c60c0e8f..e3eeb711 100644 --- a/montreal_forced_aligner/textgrid.py +++ b/montreal_forced_aligner/textgrid.py @@ -92,14 +92,14 @@ def output_textgrid_writing_errors( def parse_aligned_textgrid( - path: str, root_speaker: typing.Optional[str] = None + path: Path, root_speaker: typing.Optional[str] = None ) -> Dict[str, List[CtmInterval]]: """ Load a TextGrid as a dictionary of speaker's phone tiers Parameters ---------- - path: str + path: :class:`~pathlib.Path` TextGrid file to parse root_speaker: str, optional Optional speaker if the TextGrid has no speaker information @@ -156,7 +156,7 @@ def export_textgrid( ---------- speaker_data: dict[Speaker, dict[str, list[:class:`~montreal_forced_aligner.data.CtmInterval`]] Per speaker, per word/phone :class:`~montreal_forced_aligner.data.CtmInterval` - output_path: str + output_path: :class:`~pathlib.Path` Output path of the file duration: float Duration of the file diff --git a/montreal_forced_aligner/tokenization/__init__.py b/montreal_forced_aligner/tokenization/__init__.py new file mode 100644 index 00000000..7ea93a63 --- /dev/null +++ b/montreal_forced_aligner/tokenization/__init__.py @@ -0,0 +1,6 @@ 
+"""Tokenization classes""" + +from montreal_forced_aligner.tokenization.tokenizer import CorpusTokenizer, TokenizerValidator +from montreal_forced_aligner.tokenization.trainer import TokenizerTrainer + +__all__ = ["TokenizerTrainer", "TokenizerValidator", "CorpusTokenizer"] diff --git a/montreal_forced_aligner/tokenization/tokenizer.py b/montreal_forced_aligner/tokenization/tokenizer.py new file mode 100644 index 00000000..1ff162c3 --- /dev/null +++ b/montreal_forced_aligner/tokenization/tokenizer.py @@ -0,0 +1,439 @@ +"""Classes for tokenizers""" + +import csv +import functools +import logging +import multiprocessing as mp +import os +import queue +import time +import typing +from pathlib import Path + +import pynini +import pywrapfst +import sqlalchemy +import tqdm +from praatio import textgrid +from pynini import Fst +from pynini.lib import rewrite +from pywrapfst import SymbolTable +from sqlalchemy.orm import joinedload, selectinload + +from montreal_forced_aligner.abc import KaldiFunction, TopLevelMfaWorker +from montreal_forced_aligner.alignment.multiprocessing import construct_output_path +from montreal_forced_aligner.config import GLOBAL_CONFIG +from montreal_forced_aligner.corpus.text_corpus import TextCorpusMixin +from montreal_forced_aligner.data import MfaArguments, TextgridFormats +from montreal_forced_aligner.db import File, Utterance, bulk_update +from montreal_forced_aligner.dictionary.mixins import DictionaryMixin +from montreal_forced_aligner.exceptions import PyniniGenerationError +from montreal_forced_aligner.g2p.generator import Rewriter, RewriterWorker +from montreal_forced_aligner.helper import edit_distance, mfa_open +from montreal_forced_aligner.models import TokenizerModel +from montreal_forced_aligner.utils import Stopped, run_kaldi_function + +if typing.TYPE_CHECKING: + from dataclasses import dataclass +else: + from dataclassy import dataclass + +__all__ = [ + "TokenizerRewriter", + "TokenizerArguments", + "TokenizerFunction", + "TokenizerValidator", + "CorpusTokenizer", +] + +logger = logging.getLogger("mfa") + + +class TokenizerRewriter(Rewriter): + """ + Helper object for rewriting + + Parameters + ---------- + fst: pynini.Fst + Tokenizer FST model + grapheme_symbols: pynini.SymbolTable + Grapheme symbol table + """ + + def __init__( + self, + fst: Fst, + grapheme_symbols: SymbolTable, + ): + self.grapheme_symbols = grapheme_symbols + self.rewrite = functools.partial( + rewrite.top_rewrite, + rule=fst, + input_token_type=grapheme_symbols, + output_token_type=grapheme_symbols, + ) + + def __call__(self, i: str) -> str: # pragma: no cover + """Call the rewrite function""" + i = i.replace(" ", "") + original = list(i) + unks = [] + normalized = [] + for c in original: + if self.grapheme_symbols.member(c): + normalized.append(c) + else: + unks.append(c) + normalized.append("") + hypothesis = self.rewrite(" ".join(normalized)).split() + unk_index = 0 + for i, w in enumerate(hypothesis): + if w == "": + hypothesis[i] = unks[unk_index] + unk_index += 1 + elif w == "": + hypothesis[i] = " " + return "".join(hypothesis) + + +@dataclass +class TokenizerArguments(MfaArguments): + rewriter: Rewriter + + +class TokenizerFunction(KaldiFunction): + def __init__(self, args: TokenizerArguments): + super().__init__(args) + self.rewriter = args.rewriter + + def _run(self) -> typing.Generator: + """Run the function""" + engine = sqlalchemy.create_engine(self.db_string) + with sqlalchemy.orm.Session(engine) as session: + utterances = session.query(Utterance.id, 
+                Utterance.job_id == self.job_name
+            )
+            for u_id, text in utterances:
+                tokenized_text = self.rewriter(text)
+                yield u_id, tokenized_text
+
+
+class CorpusTokenizer(TextCorpusMixin, TopLevelMfaWorker, DictionaryMixin):
+    """
+    Top-level worker for tokenizing utterances in a corpus with a Pynini tokenizer model
+    """
+
+    model_class = TokenizerModel
+
+    def __init__(self, tokenizer_model_path: typing.Optional[Path] = None, **kwargs):
+        super().__init__(**kwargs)
+        self.tokenizer_model = TokenizerModel(
+            tokenizer_model_path, root_directory=getattr(self, "workflow_directory", None)
+        )
+
+    def setup(self) -> None:
+        """Set up the corpus tokenizer"""
+        if self.initialized:
+            return
+        self._load_corpus()
+        self.initialize_jobs()
+        super().setup()
+        self._create_dummy_dictionary()
+        self.normalize_text()
+        self.fst = pynini.Fst.read(self.tokenizer_model.fst_path)
+        self.grapheme_symbols = pywrapfst.SymbolTable.read_text(self.tokenizer_model.sym_path)
+
+        self.rewriter = TokenizerRewriter(
+            self.fst,
+            self.grapheme_symbols,
+        )
+        self.initialized = True
+
+    def export_files(self, output_directory: Path) -> None:
+        """Export transcriptions"""
+        with self.session() as session:
+            files = session.query(File).options(
+                selectinload(File.utterances),
+                selectinload(File.speakers),
+                joinedload(File.sound_file),
+            )
+            for file in files:
+                utterance_count = len(file.utterances)
+                if file.sound_file is not None:
+                    duration = file.sound_file.duration
+                else:
+                    duration = max([u.end for u in file.utterances])
+                if utterance_count == 0:
+                    logger.debug(f"Could not find any utterances for {file.name}")
+                elif (
+                    utterance_count == 1
+                    and file.utterances[0].begin == 0
+                    and file.utterances[0].end == duration
+                ):
+                    output_format = "lab"
+                else:
+                    output_format = TextgridFormats.SHORT_TEXTGRID
+                output_path = construct_output_path(
+                    file.name,
+                    file.relative_path,
+                    output_directory,
+                    output_format=output_format,
+                )
+                data = file.construct_transcription_tiers(original_text=True)
+                if output_format == "lab":
+                    for intervals in data.values():
+                        with mfa_open(output_path, "w") as f:
+                            f.write(intervals["text"][0].label)
+                else:
+                    tg = textgrid.Textgrid()
+                    tg.minTimestamp = 0
+                    tg.maxTimestamp = round(duration, 5)
+                    for speaker in file.speakers:
+                        speaker = speaker.name
+                        intervals = data[speaker]["text"]
+                        tier = textgrid.IntervalTier(
+                            speaker,
+                            [x.to_tg_interval() for x in intervals],
+                            minT=0,
+                            maxT=round(duration, 5),
+                        )
+
+                        tg.addTier(tier)
+                    tg.save(output_path, includeBlankSpaces=True, format=output_format)
+
+    def tokenize_arguments(self) -> typing.List[TokenizerArguments]:
+        return [TokenizerArguments(j.id, self.db_string, None, self.rewriter) for j in self.jobs]
+
+    def tokenize_utterances(self) -> None:
+        """
+        Tokenize utterances and save the tokenized text to each utterance in the database
+        """
+        begin = time.time()
+        if not self.initialized:
+            self.setup()
+        logger.info("Tokenizing utterances...")
+        args = self.tokenize_arguments()
+        with tqdm.tqdm(total=self.num_utterances, disable=GLOBAL_CONFIG.quiet) as pbar:
+            update_mapping = []
+            for utt_id, tokenized in run_kaldi_function(TokenizerFunction, args, pbar.update):
+                update_mapping.append({"id": utt_id, "text": tokenized})
+            with self.session() as session:
+                bulk_update(session, Utterance, update_mapping)
+                session.commit()
+
+        logger.debug(f"Tokenizing utterances took {time.time() - begin:.3f} seconds")
+
+
+class TokenizerValidator(CorpusTokenizer):
+    def __init__(self, utterances_to_tokenize: typing.Optional[typing.List[str]] = None, **kwargs):
+        super().__init__(**kwargs)
+        if utterances_to_tokenize is None:
+            utterances_to_tokenize = []
+        self.utterances_to_tokenize = utterances_to_tokenize
+
+    def setup(self):
+        TopLevelMfaWorker.setup(self)
+        if self.initialized:
+            return
+        self._current_workflow = "validation"
+        os.makedirs(self.working_log_directory, exist_ok=True)
+        self.fst = pynini.Fst.read(self.tokenizer_model.fst_path)
+        self.grapheme_symbols = pywrapfst.SymbolTable.read_text(self.tokenizer_model.sym_path)
+
+        self.rewriter = TokenizerRewriter(
+            self.fst,
+            self.grapheme_symbols,
+        )
+        self.initialized = True
+        self.uer = None
+        self.cer = None
+
+    def tokenize_utterances(self) -> typing.Dict[str, str]:
+        """
+        Tokenize utterances
+
+        Returns
+        -------
+        dict[str, str]
+            Mapping of each input utterance to its tokenized form
+        """
+        num_utterances = len(self.utterances_to_tokenize)
+        begin = time.time()
+        if not self.initialized:
+            self.setup()
+        logger.info("Tokenizing utterances...")
+        to_return = {}
+        if num_utterances < 30 or GLOBAL_CONFIG.num_jobs == 1:
+            with tqdm.tqdm(total=num_utterances, disable=GLOBAL_CONFIG.quiet) as pbar:
+                for utterance in self.utterances_to_tokenize:
+                    pbar.update(1)
+                    result = self.rewriter(utterance)
+                    to_return[utterance] = result
+        else:
+            stopped = Stopped()
+            job_queue = mp.Queue()
+            for utterance in self.utterances_to_tokenize:
+                job_queue.put(utterance)
+            error_dict = {}
+            return_queue = mp.Queue()
+            procs = []
+            for _ in range(GLOBAL_CONFIG.num_jobs):
+                p = RewriterWorker(
+                    job_queue,
+                    return_queue,
+                    self.rewriter,
+                    stopped,
+                )
+                procs.append(p)
+                p.start()
+            with tqdm.tqdm(total=num_utterances, disable=GLOBAL_CONFIG.quiet) as pbar:
+                while True:
+                    try:
+                        utterance, result = return_queue.get(timeout=1)
+                        if stopped.stop_check():
+                            continue
+                    except queue.Empty:
+                        for proc in procs:
+                            if not proc.finished.stop_check():
+                                break
+                        else:
+                            break
+                        continue
+                    pbar.update(1)
+                    if isinstance(result, Exception):
+                        error_dict[utterance] = result
+                        continue
+                    to_return[utterance] = result
+
+            for p in procs:
+                p.join()
+            if error_dict:
+                raise PyniniGenerationError(error_dict)
+        logger.debug(f"Processed {num_utterances} in {time.time() - begin:.3f} seconds")
+        return to_return
+
+    @property
+    def data_source_identifier(self) -> str:
+        """Dummy "validation" data source"""
+        return "validation"
+
+    @property
+    def data_directory(self) -> Path:
+        """Data directory"""
+        return self.working_directory
+
+    @property
+    def evaluation_csv_path(self) -> Path:
+        """Path to working directory's CSV file"""
+        return self.working_directory.joinpath("pronunciation_evaluation.csv")
+
+    def compute_validation_errors(
+        self,
+        gold_values: typing.Dict[str, str],
+        hypothesis_values: typing.Dict[str, str],
+    ):
+        """
+        Computes validation errors
+
+        Parameters
+        ----------
+        gold_values: dict[str, str]
+            Gold tokenizations
+        hypothesis_values: dict[str, str]
+            Hypothesis tokenizations
+        """
+        begin = time.time()
+        # Word-level measures.
+        correct = 0
+        incorrect = 0
+        # Label-level measures.
+        total_edits = 0
+        total_length = 0
+        # Since the edit distance algorithm is quadratic, let's do this with
+        # multiprocessing.
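+        # NOTE (illustrative sketch, assuming helper.edit_distance implements the
+        # standard Levenshtein recurrence): scoring each gold/hypothesis pair costs
+        # O(len(gold) * len(hyp)), which is what motivates the process pool below.
+        # The assumed recurrence, for reference:
+        #
+        #     def edit_distance(gold, hyp):
+        #         prev = list(range(len(hyp) + 1))
+        #         for i, g in enumerate(gold, 1):
+        #             cur = [i]
+        #             for j, h in enumerate(hyp, 1):
+        #                 cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (g != h)))
+        #             prev = cur
+        #         return prev[-1]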
+ logger.debug(f"Processing results for {len(hypothesis_values)} hypotheses") + to_comp = [] + indices = [] + output = [] + for word, gold in gold_values.items(): + if word not in hypothesis_values: + incorrect += 1 + gold_length = len(gold) + total_edits += gold_length + total_length += gold_length + output.append( + { + "Word": word, + "Gold tokenization": gold, + "Hypothesis tokenization": "", + "Accuracy": 0, + "Error rate": 1.0, + "Length": gold_length, + } + ) + continue + hyp = hypothesis_values[word] + if hyp == gold: + correct += 1 + total_length += len(hyp) + output.append( + { + "Word": word, + "Gold tokenization": gold, + "Hypothesis tokenization": hyp, + "Accuracy": 1, + "Error rate": 0.0, + "Length": len(hyp), + } + ) + else: + incorrect += 1 + indices.append(word) + to_comp.append((gold, hyp)) # Multiple hypotheses to compare + with mp.Pool(GLOBAL_CONFIG.num_jobs) as pool: + gen = pool.starmap(edit_distance, to_comp) + for i, (edits) in enumerate(gen): + word = indices[i] + gold = gold_values[word] + length = len(gold) + hyp = hypothesis_values[word] + output.append( + { + "Word": word, + "Gold tokenization": gold, + "Hypothesis tokenization": hyp, + "Accuracy": 1, + "Error rate": edits / length, + "Length": length, + } + ) + total_edits += edits + total_length += length + with mfa_open(self.evaluation_csv_path, "w") as f: + writer = csv.DictWriter( + f, + fieldnames=[ + "Word", + "Gold tokenization", + "Hypothesis tokenization", + "Accuracy", + "Error rate", + "Length", + ], + ) + writer.writeheader() + for line in output: + writer.writerow(line) + self.uer = 100 * incorrect / (correct + incorrect) + self.cer = 100 * total_edits / total_length + logger.info(f"UER:\t{self.uer:.2f}") + logger.info(f"CER:\t{self.cer:.2f}") + logger.debug( + f"Computation of errors for {len(gold_values)} utterances took {time.time() - begin:.3f} seconds" + ) diff --git a/montreal_forced_aligner/tokenization/trainer.py b/montreal_forced_aligner/tokenization/trainer.py new file mode 100644 index 00000000..4fc64d3e --- /dev/null +++ b/montreal_forced_aligner/tokenization/trainer.py @@ -0,0 +1,286 @@ +"""Classes for training tokenizers""" +import collections +import logging +import os +import shutil +import subprocess +import time +from pathlib import Path + +import pywrapfst +import sqlalchemy + +from montreal_forced_aligner.abc import MetaDict, TopLevelMfaWorker +from montreal_forced_aligner.config import GLOBAL_CONFIG +from montreal_forced_aligner.corpus.text_corpus import TextCorpusMixin +from montreal_forced_aligner.data import WorkflowType +from montreal_forced_aligner.db import Utterance +from montreal_forced_aligner.dictionary.mixins import DictionaryMixin +from montreal_forced_aligner.exceptions import KaldiProcessingError +from montreal_forced_aligner.g2p.trainer import G2PTrainer, PyniniTrainerMixin +from montreal_forced_aligner.helper import mfa_open +from montreal_forced_aligner.models import TokenizerModel +from montreal_forced_aligner.tokenization.tokenizer import TokenizerValidator +from montreal_forced_aligner.utils import log_kaldi_errors, thirdparty_binary + +__all__ = ["TokenizerTrainer"] + +logger = logging.getLogger("mfa") + + +class TokenizerTrainer( + PyniniTrainerMixin, TextCorpusMixin, G2PTrainer, TopLevelMfaWorker, DictionaryMixin +): + def __init__(self, oov_count_threshold=5, **kwargs): + super().__init__(oov_count_threshold=oov_count_threshold, **kwargs) + self.training_graphemes = set() + self.uer = None + self.cer = None + self.deletions = False + self.insertions = 
+    def _lexicon_covering(self, input_path=None, output_path=None) -> None:
+        """Builds covering grammar and lexicon FARs."""
+        with mfa_open(
+            self.working_log_directory.joinpath("covering_grammar.log"), "w"
+        ) as log_file:
+            if input_path is None:
+                input_path = self.input_path
+            if output_path is None:
+                output_path = self.output_path
+            com = [
+                thirdparty_binary("farcompilestrings"),
+                "--fst_type=compact",
+            ]
+            if self.input_token_type != "utf8":
+                com.append("--token_type=symbol")
+                com.append(f"--symbols={self.input_token_type}")
+                com.append("--unknown_symbol=<unk>")
+            else:
+                com.append("--token_type=utf8")
+            com.extend([input_path, self.input_far_path])
+            print(" ".join(map(str, com)), file=log_file)
+            subprocess.check_call(com, env=os.environ, stderr=log_file, stdout=log_file)
+            com = [
+                thirdparty_binary("farcompilestrings"),
+                "--fst_type=compact",
+                "--token_type=symbol",
+                f"--symbols={self.phone_symbol_table_path}",
+                output_path,
+                self.output_far_path,
+            ]
+            print(" ".join(map(str, com)), file=log_file)
+            subprocess.check_call(com, env=os.environ, stderr=log_file, stdout=log_file)
+            # Sets of labels for the covering grammar.
+            cg = pywrapfst.VectorFst()
+            state = cg.add_state()
+            cg.set_start(state)
+            labels = pywrapfst.SymbolTable.read_text(self.sym_path)
+            one = pywrapfst.Weight.one(cg.weight_type())
+            for i in range(labels.num_symbols()):
+                if labels.find(i) == "<eps>":
+                    continue
+                cg.add_arc(state, pywrapfst.Arc(i, i, one, state))
+            olabel = labels.find("<space>")
+            cg.add_arc(state, pywrapfst.Arc(0, olabel, one, state))
+            cg.set_final(state)
+            assert cg.verify(), "Label acceptor is ill-formed"
+            cg.write(self.cg_path)
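The covering grammar's role is easiest to see on a toy symbol table: it is a single-state acceptor that copies every grapheme to itself and additionally allows <space> to be produced from nothing (an epsilon-input arc), which is exactly the edit a tokenizer needs to make. A sketch under those assumptions, with a hypothetical three-grapheme inventory in place of the real graphemes.txt:

    import pywrapfst

    syms = pywrapfst.SymbolTable()
    syms.add_symbol("<eps>")  # key 0, reserved for epsilon
    for g in ("a", "b", "<unk>"):
        syms.add_symbol(g)
    space = syms.add_symbol("<space>")

    cg = pywrapfst.VectorFst()
    state = cg.add_state()
    cg.set_start(state)
    one = pywrapfst.Weight.one(cg.weight_type())
    for i in range(1, syms.num_symbols()):
        cg.add_arc(state, pywrapfst.Arc(i, i, one, state))  # copy any grapheme
    cg.add_arc(state, pywrapfst.Arc(0, space, one, state))  # insert <space> from epsilon
    cg.set_final(state)
    assert cg.verify()
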
+    def evaluate_tokenizer(self) -> None:
+        """
+        Validate the tokenizer model against held out data
+        """
+        temp_model_path = self.working_log_directory.joinpath("tokenizer_model.zip")
+        self.export_model(temp_model_path)
+        temp_dir = self.working_directory.joinpath("validation")
+        temp_dir.mkdir(parents=True, exist_ok=True)
+        with self.session() as session:
+            validation_set = {}
+            query = session.query(Utterance.normalized_character_text).filter(
+                Utterance.ignored == True  # noqa
+            )
+            for (text,) in query:
+                tokenized = text.split()
+                untokenized = [x for x in tokenized if x != "<space>"]
+                tokenized = [x if x != "<space>" else " " for x in tokenized]
+                validation_set[" ".join(untokenized)] = "".join(tokenized)
+        gen = TokenizerValidator(
+            tokenizer_model_path=temp_model_path,
+            corpus_directory=self.corpus_directory,
+            utterances_to_tokenize=list(validation_set.keys()),
+        )
+        output = gen.tokenize_utterances()
+        with mfa_open(temp_dir.joinpath("validation_output.txt"), "w") as f:
+            for orthography, tokenized in output.items():
+                if not tokenized:
+                    continue
+                f.write(f"{orthography}\t{tokenized}\n")
+        gen.compute_validation_errors(validation_set, output)
+        self.uer = gen.uer
+        self.cer = gen.cer
+
+    def finalize_training(self) -> None:
+        """Finalize training"""
+        shutil.copyfile(self.fst_path, self.working_directory.joinpath("tokenizer.fst"))
+        if self.evaluation_mode:
+            self.evaluate_tokenizer()
+
+    def train(self) -> None:
+        """
+        Train a tokenizer model
+        """
+        os.makedirs(self.working_log_directory, exist_ok=True)
+        begin = time.time()
+        if os.path.exists(self.far_path) and os.path.exists(self.encoder_path):
+            logger.info("Alignment already done, skipping!")
+        else:
+            self.align_g2p()
+            logger.debug(f"Aligning took {time.time() - begin:.3f} seconds")
+        begin = time.time()
+        self.generate_model()
+        logger.debug(f"Generating model took {time.time() - begin:.3f} seconds")
+        self.finalize_training()
+
+    def export_model(self, output_model_path: Path) -> None:
+        """
+        Export
tokenizer model to specified path + + Parameters + ---------- + output_model_path: :class:`~pathlib.Path` + Path to export model + """ + directory = output_model_path.parent + + models_temp_dir = self.working_directory.joinpath("model_archive_temp") + model = TokenizerModel.empty(output_model_path.stem, root_directory=models_temp_dir) + model.add_meta_file(self) + model.add_tokenizer_model(self.working_directory) + model.add_graphemes_path(self.working_directory) + if directory: + os.makedirs(directory, exist_ok=True) + model.dump(output_model_path) + if not GLOBAL_CONFIG.current_profile.debug: + model.clean_up() + # self.clean_up() + logger.info(f"Saved model to {output_model_path}") diff --git a/montreal_forced_aligner/transcription/multiprocessing.py b/montreal_forced_aligner/transcription/multiprocessing.py index 76a9b811..a158924d 100644 --- a/montreal_forced_aligner/transcription/multiprocessing.py +++ b/montreal_forced_aligner/transcription/multiprocessing.py @@ -9,6 +9,7 @@ import re import subprocess import typing +from pathlib import Path from typing import TYPE_CHECKING, Dict, List, TextIO import pynini @@ -22,7 +23,6 @@ if TYPE_CHECKING: from dataclasses import dataclass - else: from dataclassy import dataclass @@ -55,27 +55,25 @@ class CreateHclgArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run - working_directory: str + working_directory: :class:`~pathlib.Path` Current working directory - path_template: str - Path template for intermediate files - words_path: str + words_path: :class:`~pathlib.Path` Path to words symbol table - carpa_path: str + carpa_path: :class:`~pathlib.Path` Path to .carpa file - small_arpa_path: str + small_arpa_path: :class:`~pathlib.Path` Path to small ARPA file - medium_arpa_path: str + medium_arpa_path: :class:`~pathlib.Path` Path to medium ARPA file - big_arpa_path: str + big_arpa_path: :class:`~pathlib.Path` Path to big ARPA file - model_path: str + model_path: :class:`~pathlib.Path` Acoustic model path - disambig_L_path: str + disambig_L_path: :class:`~pathlib.Path` Path to disambiguated lexicon file - disambig_int_path: str + disambig_int_path: :class:`~pathlib.Path` Path to disambiguation symbol integer file hclg_options: dict[str, Any] HCLG options @@ -83,24 +81,18 @@ class CreateHclgArguments(MfaArguments): Words mapping """ - working_directory: str - path_template: str - words_path: str - carpa_path: str - small_arpa_path: str - medium_arpa_path: str - big_arpa_path: str - model_path: str - disambig_L_path: str - disambig_int_path: str + working_directory: Path + words_path: Path + carpa_path: Path + small_arpa_path: Path + medium_arpa_path: Path + big_arpa_path: Path + model_path: Path + disambig_L_path: Path + disambig_int_path: Path hclg_options: MetaDict words_mapping: Dict[str, int] - @property - def hclg_path(self) -> str: - """Path to HCLG FST file""" - return self.path_template.format(file_name="HCLG") - @dataclass class DecodeArguments(MfaArguments): @@ -113,7 +105,7 @@ class DecodeArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run dictionaries: list[int] List of dictionary ids @@ -121,23 +113,23 @@ class DecodeArguments(MfaArguments): Mapping of dictionaries to feature generation strings decode_options: dict[str, Any] Decoding options - model_path: str 
+ model_path: :class:`~pathlib.Path` Path to model file - lat_paths: dict[int, str] + lat_paths: dict[int, Path] Per dictionary lattice paths - word_symbol_paths: dict[int, str] + word_symbol_paths: dict[int, Path] Per dictionary word symbol table paths - hclg_paths: dict[int, str] + hclg_paths: dict[int, Path] Per dictionary HCLG.fst paths """ dictionaries: List[int] feature_strings: Dict[int, str] decode_options: MetaDict - model_path: str - lat_paths: Dict[int, str] - word_symbol_paths: Dict[int, str] - hclg_paths: Dict[int, str] + model_path: Path + lat_paths: Dict[int, Path] + word_symbol_paths: Dict[int, Path] + hclg_paths: Dict[int, Path] @dataclass @@ -151,7 +143,7 @@ class DecodePhoneArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run dictionaries: list[int] List of dictionary ids @@ -159,23 +151,23 @@ class DecodePhoneArguments(MfaArguments): Mapping of dictionaries to feature generation strings decode_options: dict[str, Any] Decoding options - model_path: str + model_path: :class:`~pathlib.Path` Path to model file - lat_paths: dict[int, str] + lat_paths: dict[int, Path] Per dictionary lattice paths - phone_symbol_path: str + phone_symbol_path: :class:`~pathlib.Path` Phone symbol table paths - hclg_path: str + hclg_path: :class:`~pathlib.Path` HCLG.fst paths """ dictionaries: List[int] feature_strings: Dict[int, str] decode_options: MetaDict - model_path: str - lat_paths: Dict[int, str] - phone_symbol_path: str - hclg_path: str + model_path: Path + lat_paths: Dict[int, Path] + phone_symbol_path: Path + hclg_path: Path @dataclass @@ -189,28 +181,28 @@ class LmRescoreArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run dictionaries: list[int] List of dictionary ids lm_rescore_options: dict[str, Any] Rescoring options - lat_paths: dict[int, str] + lat_paths: dict[int, Path] Per dictionary lattice paths - rescored_lat_paths: dict[int, str] + rescored_lat_paths: dict[int, Path] Per dictionary rescored lattice paths - old_g_paths: dict[int, str] + old_g_paths: dict[int, Path] Mapping of dictionaries to small G.fst paths - new_g_paths: dict[int, str] + new_g_paths: dict[int, Path] Mapping of dictionaries to medium G.fst paths """ dictionaries: List[int] lm_rescore_options: MetaDict - lat_paths: Dict[int, str] - rescored_lat_paths: Dict[int, str] - old_g_paths: Dict[int, str] - new_g_paths: Dict[int, str] + lat_paths: Dict[int, Path] + rescored_lat_paths: Dict[int, Path] + old_g_paths: Dict[int, Path] + new_g_paths: Dict[int, Path] @dataclass @@ -224,25 +216,25 @@ class CarpaLmRescoreArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run dictionaries: list[int] List of dictionary ids - lat_paths: dict[int, str] + lat_paths: dict[int, Path] Per dictionary lattice paths - rescored_lat_paths: dict[int, str] + rescored_lat_paths: dict[int, Path] Per dictionary rescored lattice paths - old_g_paths: dict[int, str] + old_g_paths: dict[int, Path] Mapping of dictionaries to medium G.fst paths - new_g_paths: dict[int, str] + new_g_paths: dict[int, Path] Mapping of dictionaries to G.carpa paths """ dictionaries: List[int] - lat_paths: Dict[int, str] - rescored_lat_paths: Dict[int, str] - 
old_g_paths: Dict[int, str] - new_g_paths: Dict[int, str] + lat_paths: Dict[int, Path] + rescored_lat_paths: Dict[int, Path] + old_g_paths: Dict[int, Path] + new_g_paths: Dict[int, Path] @dataclass @@ -256,31 +248,31 @@ class InitialFmllrArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run dictionaries: list[int] List of dictionary ids feature_strings: dict[int, str] Mapping of dictionaries to feature generation strings - model_path: str + model_path: :class:`~pathlib.Path` Path to model file fmllr_options: dict[str, Any] fMLLR options - pre_trans_paths: dict[int, str] + pre_trans_paths: dict[int, Path] Per dictionary pre-fMLLR lattice paths - lat_paths: dict[int, str] + lat_paths: dict[int, Path] Per dictionary lattice paths - spk2utt_paths: dict[int, str] + spk2utt_paths: dict[int, Path] Per dictionary speaker to utterance mapping paths """ dictionaries: List[int] feature_strings: Dict[int, str] - model_path: str + model_path: Path fmllr_options: MetaDict - pre_trans_paths: Dict[int, str] - lat_paths: Dict[int, str] - spk2utt_paths: Dict[int, str] + pre_trans_paths: Dict[int, Path] + lat_paths: Dict[int, Path] + spk2utt_paths: Dict[int, Path] @dataclass @@ -294,29 +286,29 @@ class LatGenFmllrArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run dictionaries: list[int] List of dictionary ids feature_strings: dict[int, str] Mapping of dictionaries to feature generation strings - model_path: str + model_path: :class:`~pathlib.Path` Path to model file decode_options: dict[str, Any] Decoding options - hclg_paths: dict[int, str] + hclg_paths: dict[int, Path] Per dictionary HCLG.fst paths - tmp_lat_paths: dict[int, str] + tmp_lat_paths: dict[int, Path] Per dictionary temporary lattice paths """ dictionaries: List[int] feature_strings: Dict[int, str] - model_path: str + model_path: Path decode_options: MetaDict - word_symbol_paths: Dict[int, str] - hclg_paths: typing.Union[Dict[int, str], str] - tmp_lat_paths: Dict[int, str] + word_symbol_paths: Dict[int, Path] + hclg_paths: typing.Union[Dict[int, Path], Path] + tmp_lat_paths: Dict[int, Path] @dataclass @@ -330,31 +322,31 @@ class FinalFmllrArguments(MfaArguments): Integer ID of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run dictionaries: list[int] List of dictionary ids feature_strings: dict[int, str] Mapping of dictionaries to feature generation strings - model_path: str + model_path: :class:`~pathlib.Path` Path to model file fmllr_options: dict[str, Any] fMLLR options - trans_paths: dict[int, str] + trans_paths: dict[int, Path] Per dictionary transform paths - spk2utt_paths: dict[int, str] + spk2utt_paths: dict[int, Path] Per dictionary speaker to utterance mapping paths - tmp_lat_paths: dict[int, str] + tmp_lat_paths: dict[int, Path] Per dictionary temporary lattice paths """ dictionaries: List[int] feature_strings: Dict[int, str] - model_path: str + model_path: Path fmllr_options: MetaDict - trans_paths: Dict[int, str] - spk2utt_paths: Dict[int, str] - tmp_lat_paths: Dict[int, str] + trans_paths: Dict[int, Path] + spk2utt_paths: Dict[int, Path] + tmp_lat_paths: Dict[int, Path] @dataclass @@ -368,31 +360,31 @@ class FmllrRescoreArguments(MfaArguments): Integer ID 
of the job db_string: str String for database connections - log_path: str + log_path: :class:`~pathlib.Path` Path to save logging information during the run dictionaries: list[int] List of dictionary ids feature_strings: dict[int, str] Mapping of dictionaries to feature generation strings - model_path: str + model_path: :class:`~pathlib.Path` Path to model file fmllr_options: dict[str, Any] fMLLR options - tmp_lat_paths: dict[int, str] + tmp_lat_paths: dict[int, Path] Per dictionary temporary lattice paths - final_lat_paths: dict[int, str] + final_lat_paths: dict[int, Path] Per dictionary lattice paths """ dictionaries: List[int] feature_strings: Dict[int, str] - model_path: str + model_path: Path fmllr_options: MetaDict - tmp_lat_paths: Dict[int, str] - final_lat_paths: Dict[int, str] + tmp_lat_paths: Dict[int, Path] + final_lat_paths: Dict[int, Path] -def compose_lg(dictionary_path: str, small_g_path: str, lg_path: str, log_file: TextIO) -> None: +def compose_lg(dictionary_path: Path, small_g_path: Path, lg_path: Path, log_file: TextIO) -> None: """ Compose an LG.fst @@ -410,11 +402,11 @@ def compose_lg(dictionary_path: str, small_g_path: str, lg_path: str, log_file: Parameters ---------- - dictionary_path: str + dictionary_path: :class:`~pathlib.Path` Path to a lexicon fst file - small_g_path: str + small_g_path: :class:`~pathlib.Path` Path to the small language model's G.fst - lg_path: str + lg_path: :class:`~pathlib.Path` Output path to LG.fst log_file: TextIO Log file handler to output logging info to @@ -457,13 +449,13 @@ def compose_lg(dictionary_path: str, small_g_path: str, lg_path: str, log_file: def compose_clg( - in_disambig: typing.Optional[str], - out_disambig: typing.Optional[str], + in_disambig: typing.Optional[Path], + out_disambig: typing.Optional[Path], context_width: int, central_pos: int, - ilabels_temp: str, - lg_path: str, - clg_path: str, + ilabels_temp: Path, + lg_path: Path, + clg_path: Path, log_file: TextIO, ) -> None: """ @@ -478,19 +470,19 @@ def compose_clg( Parameters ---------- - in_disambig: str + in_disambig: :class:`~pathlib.Path` Path to read disambiguation symbols file - out_disambig: str + out_disambig: :class:`~pathlib.Path` Path to write disambiguation symbols file context_width: int Context width of the acoustic model central_pos: int Central position of the acoustic model - ilabels_temp: + ilabels_temp: :class:`~pathlib.Path` Temporary file for ilabels - lg_path: str + lg_path: :class:`~pathlib.Path` Path to a LG.fst file - clg_path: + clg_path: :class:`~pathlib.Path` Path to save CLG.fst file log_file: TextIO Log file handler to output logging info to @@ -520,11 +512,11 @@ def compose_clg( def compose_hclg( - model_path: str, - ilabels_temp: str, + model_path: Path, + ilabels_temp: Path, transition_scale: float, - clg_path: str, - hclga_path: str, + clg_path: Path, + hclga_path: Path, log_file: TextIO, ) -> None: """ @@ -549,22 +541,22 @@ def compose_hclg( Parameters ---------- - model_path: str + model_path: :class:`~pathlib.Path` Path to acoustic model - ilabels_temp: str + ilabels_temp: :class:`~pathlib.Path` Path to temporary ilabels file transition_scale: float Transition scale for the fst - clg_path: str + clg_path: :class:`~pathlib.Path` Path to CLG.fst file - hclga_path: str + hclga_path: :class:`~pathlib.Path` Path to save HCLGa.fst file log_file: TextIO Log file handler to output logging info to """ - tree_path = model_path.replace("final.mdl", "tree") - ha_path = hclga_path.replace("HCLGa", "Ha") - ha_out_disambig = 
hclga_path.replace("HCLGa", "disambig_tid") + tree_path = model_path.with_name("tree") + ha_path = hclga_path.with_stem("Ha" + hclga_path.stem.split("_")[-1]) + ha_out_disambig = hclga_path.with_stem("disambig_tid" + hclga_path.stem.split("_")[-1]) make_h_proc = subprocess.Popen( [ thirdparty_binary("make-h-transducer"), @@ -618,7 +610,7 @@ def compose_hclg( minimize_proc.communicate() -def compose_g(arpa_path: str, words_path: str, g_path: str, log_file: TextIO) -> None: +def compose_g(arpa_path: Path, words_path: Path, g_path: Path, log_file: TextIO) -> None: """ Create G.fst from an ARPA formatted language model @@ -629,11 +621,11 @@ def compose_g(arpa_path: str, words_path: str, g_path: str, log_file: TextIO) -> Parameters ---------- - arpa_path: str + arpa_path: :class:`~pathlib.Path` Path to ARPA file - words_path: str + words_path: :class:`~pathlib.Path` Path to words symbols file - g_path: str + g_path: :class:`~pathlib.Path` Path to output G.fst file log_file: TextIO Log file handler to output logging info to @@ -653,10 +645,10 @@ def compose_g(arpa_path: str, words_path: str, g_path: str, log_file: TextIO) -> def compose_g_carpa( - in_carpa_path: str, - temp_carpa_path: str, + in_carpa_path: Path, + temp_carpa_path: Path, words_mapping: Dict[str, int], - carpa_path: str, + carpa_path: Path, log_file: TextIO, ): """ @@ -669,13 +661,13 @@ def compose_g_carpa( Parameters ---------- - in_carpa_path: str + in_carpa_path: :class:`~pathlib.Path` Input ARPA model path - temp_carpa_path: str + temp_carpa_path: :class:`~pathlib.Path` Temporary CARPA model path words_mapping: dict[str, int] Words symbols mapping - carpa_path: str + carpa_path: :class:`~pathlib.Path` Path to save output G.carpa log_file: TextIO Log file handler to output logging info to @@ -764,7 +756,6 @@ class CreateHclgFunction(KaldiFunction): def __init__(self, args: CreateHclgArguments): super().__init__(args) self.working_directory = args.working_directory - self.path_template = args.path_template self.words_path = args.words_path self.carpa_path = args.carpa_path self.small_arpa_path = args.small_arpa_path @@ -778,24 +769,24 @@ def __init__(self, args: CreateHclgArguments): def _run(self) -> typing.Generator[typing.Tuple[bool, str]]: """Run the function""" - hclg_path = self.path_template.format(file_name="HCLG") - small_g_path = self.path_template.format(file_name="G.small") - medium_g_path = self.path_template.format(file_name="G.med") - lg_path = self.path_template.format(file_name="LG") - hclga_path = self.path_template.format(file_name="HCLGa") + hclg_path = self.working_directory.joinpath(f"HCLG.{self.job_name}.fst") + small_g_path = hclg_path.with_stem(f"G_small.{self.job_name}") + medium_g_path = hclg_path.with_stem(f"G_med.{self.job_name}") + lg_path = hclg_path.with_stem(f"LG.{self.job_name}") + hclga_path = hclg_path.with_stem(f"HCLGa.{self.job_name}") if os.path.exists(hclg_path): return with mfa_open(self.log_path, "w") as log_file: context_width = self.hclg_options["context_width"] central_pos = self.hclg_options["central_pos"] - clg_path = self.path_template.format(file_name=f"CLG_{context_width}_{central_pos}") - ilabels_temp = self.path_template.format( - file_name=f"ilabels_{context_width}_{central_pos}" - ).replace(".fst", "") - out_disambig = self.path_template.format( - file_name=f"disambig_ilabels_{context_width}_{central_pos}" - ).replace(".fst", ".int") + clg_path = hclg_path.with_stem(f"CLG_{context_width}_{central_pos}.{self.job_name}") + ilabels_temp = hclg_path.with_name( + 
f"ilabels_{context_width}_{central_pos}.{self.job_name}" + ) + out_disambig = hclg_path.with_name( + f"disambig_ilabels_{context_width}_{central_pos}_{self.job_name}.int" + ) log_file.write("Generating decoding graph...\n") if not os.path.exists(small_g_path): @@ -808,7 +799,7 @@ def _run(self) -> typing.Generator[typing.Tuple[bool, str]]: yield 1 if not os.path.exists(self.carpa_path): log_file.write("Generating G.carpa...") - temp_carpa_path = self.carpa_path + ".temp" + temp_carpa_path = self.carpa_path.with_suffix(".temp") compose_g_carpa( self.big_arpa_path, temp_carpa_path, @@ -872,7 +863,7 @@ def _run(self) -> typing.Generator[typing.Tuple[bool, str]]: ) convert_proc.communicate() self.check_call(convert_proc) - if os.path.exists(hclg_path): + if hclg_path.exists(): yield True, hclg_path else: yield False, hclg_path @@ -1005,8 +996,6 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int]]: rescored_lat_path = self.rescored_lat_paths[dict_id] old_g_path = self.old_g_paths[dict_id] new_g_path = self.new_g_paths[dict_id] - if " " in new_g_path: - new_g_path = f'"{new_g_path}"' project_type_arg = "--project_type=output" if os.path.exists(rescored_lat_path): continue @@ -1022,7 +1011,7 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int]]: thirdparty_binary("lattice-lmrescore-pruned"), f"--acoustic-scale={self.lm_rescore_options['acoustic_scale']}", "-", - f"fstproject {project_type_arg} {new_g_path} |", + f'fstproject {project_type_arg} "{new_g_path}" |', f"ark,s,cs:{lat_path}", f"ark:{rescored_lat_path}", ], @@ -1358,8 +1347,8 @@ def _run(self) -> typing.Generator[int]: for dict_id in self.dictionaries: feature_string = self.feature_strings[dict_id] trans_path = self.trans_paths[dict_id] - temp_trans_path = trans_path + ".temp" - temp_composed_trans_path = trans_path + ".temp_composed" + temp_trans_path = trans_path.with_suffix(".temp") + temp_composed_trans_path = trans_path.with_suffix(".temp_composed") spk2utt_path = self.spk2utt_paths[dict_id] tmp_lat_path = self.tmp_lat_paths[dict_id] determinize_proc = subprocess.Popen( @@ -1520,13 +1509,13 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int]]: class PerSpeakerDecodeArguments(MfaArguments): """Arguments for :class:`~montreal_forced_aligner.validation.corpus_validator.PerSpeakerDecodeFunction`""" - model_directory: str + model_directory: Path feature_strings: Dict[int, str] - lat_paths: Dict[int, str] - model_path: str - disambiguation_symbols_int_path: str + lat_paths: Dict[int, Path] + model_path: Path + disambiguation_symbols_int_path: Path decode_options: MetaDict - tree_path: str + tree_path: Path order: int method: str diff --git a/montreal_forced_aligner/transcription/transcriber.py b/montreal_forced_aligner/transcription/transcriber.py index 1316f016..ef15902c 100644 --- a/montreal_forced_aligner/transcription/transcriber.py +++ b/montreal_forced_aligner/transcription/transcriber.py @@ -14,6 +14,7 @@ import subprocess import time import typing +from pathlib import Path from queue import Empty from typing import TYPE_CHECKING, Dict, List, Optional, Tuple @@ -187,7 +188,7 @@ def train_speaker_lm_arguments( TrainSpeakerLmArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"train_lm.{j.id}.log"), + self.working_log_directory.joinpath(f"train_lm.{j.id}.log"), self.model_path, self.order, self.method, @@ -244,14 +245,14 @@ def train_speaker_lms(self) -> None: logger.debug(f"Compiling speaker language models took {time.time() - begin:.3f} seconds") @property - def 
model_directory(self) -> str: + def model_directory(self) -> Path: """Model directory for the transcriber""" - return os.path.join(self.output_directory, "models") + return self.output_directory.joinpath("models") @property - def model_log_directory(self) -> str: + def model_log_directory(self) -> Path: """Model directory for the transcriber""" - return os.path.join(self.model_directory, "log") + return self.model_directory.joinpath("log") def lm_rescore(self) -> None: """ @@ -377,8 +378,8 @@ def train_phone_lm(self): ngram_order = 4 num_ngrams = 20000 - phone_lm_path = os.path.join(self.phones_dir, "phone_lm.fst") - log_path = os.path.join(self.phones_dir, "phone_lm_training.log") + phone_lm_path = self.phones_dir.joinpath("phone_lm.fst") + log_path = self.phones_dir.joinpath("phone_lm_training.log") unigram_phones = set() return_queue = mp.Queue() stopped = Stopped() @@ -390,14 +391,14 @@ def train_phone_lm(self): total=self.num_current_utterances, disable=GLOBAL_CONFIG.quiet ) as pbar: - with mfa_open(os.path.join(self.phones_dir, "phone_boundaries.int"), "w") as f: + with mfa_open(self.phones_dir.joinpath("phone_boundaries.int"), "w") as f: for p in session.query(Phone): f.write(f"{p.mapping_id} singleton\n") for j in self.jobs: args = TrainLmArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"ngram_count.{j.id}.log"), + self.working_log_directory.joinpath(f"ngram_count.{j.id}.log"), self.phones_dir, self.phone_symbol_table_path, ngram_order, @@ -407,7 +408,7 @@ def train_phone_lm(self): p = KaldiProcessWorker(j.id, return_queue, function, stopped) procs.append(p) p.start() - count_paths.append(os.path.join(self.phones_dir, f"{j.id}.cnts")) + count_paths.append(self.phones_dir.joinpath(f"{j.id}.cnts")) while True: try: result = return_queue.get(timeout=1) @@ -438,7 +439,7 @@ def train_phone_lm(self): raise v logger.info("Training model...") with mfa_open(log_path, "w") as log_file: - merged_file = os.path.join(self.phones_dir, "merged.cnts") + merged_file = self.phones_dir.joinpath("merged.cnts") if len(count_paths) > 1: ngrammerge_proc = subprocess.Popen( [ @@ -490,7 +491,7 @@ def train_phone_lm(self): log_file.flush() bigram_fst = model.construct_bigram_fst("#1", allowed_bigrams, phone_symbols) - bigram_fst.write(os.path.join(self.phones_dir, "bigram.fst")) + bigram_fst.write(self.phones_dir.joinpath("bigram.fst")) bigram_fst.project("output") push_special_proc = subprocess.Popen( [thirdparty_binary("fstpushspecial")], @@ -528,15 +529,15 @@ def setup_phone_lm(self) -> None: from montreal_forced_aligner.transcription.multiprocessing import compose_clg, compose_hclg self.train_phone_lm() - with mfa_open(os.path.join(self.working_log_directory, "hclg.log"), "w") as log_file: + with mfa_open(self.working_log_directory.joinpath("hclg.log"), "w") as log_file: context_width = self.hclg_options["context_width"] central_pos = self.hclg_options["central_pos"] clg_path = os.path.join( self.working_directory, f"CLG_{context_width}_{central_pos}.fst" ) - hclga_path = os.path.join(self.working_directory, "HCLGa.fst") - hclg_path = os.path.join(self.working_directory, "HCLG_phone.fst") + hclga_path = self.working_directory.joinpath("HCLGa.fst") + hclg_path = self.working_directory.joinpath("HCLG_phone.fst") ilabels_temp = os.path.join( self.working_directory, f"ilabels_{context_width}_{central_pos}" ) @@ -550,7 +551,7 @@ def setup_phone_lm(self) -> None: context_width, central_pos, ilabels_temp, - os.path.join(self.phones_dir, "phone_lm.fst"), + 
self.phones_dir.joinpath("phone_lm.fst"), clg_path, log_file, ) @@ -693,7 +694,7 @@ def evaluate_transcriptions(self) -> Tuple[float, float]: ser, wer, cer = self.compute_wer() logger.info(f"SER: {100 * ser:.2f}%, WER: {100 * wer:.2f}%, CER: {100 * cer:.2f}%") - def save_transcription_evaluation(self, output_directory: str) -> None: + def save_transcription_evaluation(self, output_directory: Path) -> None: """ Save transcription evaluation to an output directory @@ -702,7 +703,7 @@ def save_transcription_evaluation(self, output_directory: str) -> None: output_directory: str Directory to save evaluation """ - output_path = os.path.join(output_directory, "transcription_evaluation.csv") + output_path = output_directory.joinpath("transcription_evaluation.csv") with mfa_open(output_path, "w") as f, self.session() as session: writer = csv.writer(f) writer.writerow( @@ -966,7 +967,7 @@ def lat_gen_fmllr(self) -> None: workflow = self.current_workflow arguments = self.lat_gen_fmllr_arguments(workflow.workflow_type) with tqdm.tqdm(total=self.num_utterances, disable=GLOBAL_CONFIG.quiet) as pbar, mfa_open( - os.path.join(self.working_log_directory, "lat_gen_fmllr_log_like.csv"), + self.working_log_directory.joinpath("lat_gen_fmllr_log_like.csv"), "w", encoding="utf8", ) as log_file: @@ -1179,7 +1180,7 @@ def decode_arguments( PerSpeakerDecodeArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"per_speaker_decode.{j.id}.log"), + self.working_log_directory.joinpath(f"per_speaker_decode.{j.id}.log"), self.model_directory, feat_strings, j.construct_path_dictionary(self.working_directory, "lat", "ark"), @@ -1196,14 +1197,14 @@ def decode_arguments( DecodePhoneArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"decode.{j.id}.log"), + self.working_log_directory.joinpath(f"decode.{j.id}.log"), j.dictionary_ids, feat_strings, self.decode_options, self.alignment_model_path, j.construct_path_dictionary(self.working_directory, "lat", "ark"), self.phone_symbol_table_path, - os.path.join(self.working_directory, "HCLG_phone.fst"), + self.working_directory.joinpath("HCLG_phone.fst"), ) ) else: @@ -1211,7 +1212,7 @@ def decode_arguments( DecodeArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"decode.{j.id}.log"), + self.working_log_directory.joinpath(f"decode.{j.id}.log"), j.dictionary_ids, feat_strings, self.decode_options, @@ -1240,13 +1241,13 @@ def lm_rescore_arguments(self) -> List[LmRescoreArguments]: LmRescoreArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"lm_rescore.{j.id}.log"), + self.working_log_directory.joinpath(f"lm_rescore.{j.id}.log"), j.dictionary_ids, self.lm_rescore_options, j.construct_path_dictionary(self.working_directory, "lat", "ark"), j.construct_path_dictionary(self.working_directory, "lat.rescored", "ark"), - j.construct_dictionary_dependent_paths(self.model_directory, "G.small", "fst"), - j.construct_dictionary_dependent_paths(self.model_directory, "G.med", "fst"), + j.construct_dictionary_dependent_paths(self.model_directory, "G_small", "fst"), + j.construct_dictionary_dependent_paths(self.model_directory, "G_med", "fst"), ) for j in self.jobs ] @@ -1264,11 +1265,11 @@ def carpa_lm_rescore_arguments(self) -> List[CarpaLmRescoreArguments]: CarpaLmRescoreArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"carpa_lm_rescore.{j.id}.log"), + 
self.working_log_directory.joinpath(f"carpa_lm_rescore.{j.id}.log"), j.dictionary_ids, j.construct_path_dictionary(self.working_directory, "lat.rescored", "ark"), j.construct_path_dictionary(self.working_directory, "lat.carpa.rescored", "ark"), - j.construct_dictionary_dependent_paths(self.model_directory, "G.med", "fst"), + j.construct_dictionary_dependent_paths(self.model_directory, "G_med", "fst"), j.construct_dictionary_dependent_paths(self.model_directory, "G", "carpa"), ) for j in self.jobs @@ -1308,7 +1309,7 @@ def initial_fmllr_arguments(self) -> List[InitialFmllrArguments]: InitialFmllrArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"initial_fmllr.{j.id}.log"), + self.working_log_directory.joinpath(f"initial_fmllr.{j.id}.log"), j.dictionary_ids, feat_strings, self.model_path, @@ -1350,7 +1351,7 @@ def lat_gen_fmllr_arguments( self.feature_options["uses_speaker_adaptation"], ) else: - hclg_paths = os.path.join(self.working_directory, "HCLG_phone.fst") + hclg_paths = self.working_directory.joinpath("HCLG_phone.fst") word_paths = self.phone_symbol_table_path feat_strings = j.construct_feature_proc_string( @@ -1366,7 +1367,7 @@ def lat_gen_fmllr_arguments( LatGenFmllrArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"lat_gen_fmllr.{j.id}.log"), + self.working_log_directory.joinpath(f"lat_gen_fmllr.{j.id}.log"), j.dictionary_ids, feat_strings, self.model_path, @@ -1404,7 +1405,7 @@ def final_fmllr_arguments(self) -> List[FinalFmllrArguments]: FinalFmllrArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"final_fmllr.{j.id}.log"), + self.working_log_directory.joinpath(f"final_fmllr.{j.id}.log"), j.dictionary_ids, feat_strings, self.model_path, @@ -1441,7 +1442,7 @@ def fmllr_rescore_arguments(self) -> List[FmllrRescoreArguments]: FmllrRescoreArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"fmllr_rescore.{j.id}.log"), + self.working_log_directory.joinpath(f"fmllr_rescore.{j.id}.log"), j.dictionary_ids, feat_strings, self.model_path, @@ -1487,8 +1488,8 @@ class Transcriber(TranscriberMixin, TopLevelMfaWorker): def __init__( self, - acoustic_model_path: str, - language_model_path: str, + acoustic_model_path: Path, + language_model_path: Path, output_type: str = "transcription", **kwargs, ): @@ -1514,11 +1515,10 @@ def create_hclgs_arguments(self) -> Dict[int, CreateHclgArguments]: args[d.id] = CreateHclgArguments( d.id, getattr(self, "db_string", ""), - os.path.join(self.model_directory, "log", f"hclg.{d.id}.log"), + self.model_directory.joinpath("log", f"hclg.{d.id}.log"), self.model_directory, - os.path.join(self.model_directory, f"{{file_name}}.{d.id}.fst"), - os.path.join(self.model_directory, f"words.{d.id}.txt"), - os.path.join(self.model_directory, f"G.{d.id}.carpa"), + self.model_directory.joinpath(f"words.{d.id}.txt"), + self.model_directory.joinpath(f"G.{d.id}.carpa"), self.language_model.small_arpa_path, self.language_model.medium_arpa_path, self.language_model.carpa_path, @@ -1595,7 +1595,7 @@ def create_hclgs(self) -> None: pbar.update(1) error_logs = [] for arg in dict_arguments: - if not os.path.exists(arg.hclg_path): + if not self.model_directory.joinpath(f"HCLG.{arg.job_name}.fst").exists(): error_logs.append(arg.log_path) if error_logs: raise KaldiProcessingError(error_logs) @@ -1629,7 +1629,7 @@ def create_decoding_graph(self) -> None: "Running `mfa train_lm` on the ARPA file will remove this 
warning." ) logger.info("Parsing large ngram model...") - mod_path = os.path.join(self.model_directory, "base_lm.mod") + mod_path = self.model_directory.joinpath("base_lm.mod") new_carpa_path = os.path.join(self.model_directory, "base_lm.arpa") with mfa_open(big_arpa_path, "r") as inf, mfa_open(new_carpa_path, "w") as outf: for line in inf: @@ -1642,7 +1642,7 @@ def create_decoding_graph(self) -> None: "Generating small model from the large ARPA with a pruning threshold of 3e-7" ) prune_thresh_small = 0.0000003 - small_mod_path = mod_path.replace(".mod", "_small.mod") + small_mod_path = mod_path.with_stem(mod_path.stem + "_small") subprocess.call( [ "ngramshrink", @@ -1659,7 +1659,7 @@ def create_decoding_graph(self) -> None: "Generating medium model from the large ARPA with a pruning threshold of 1e-7" ) prune_thresh_medium = 0.0000001 - med_mod_path = mod_path.replace(".mod", "_med.mod") + med_mod_path = mod_path.with_stem(mod_path.stem + "_med") subprocess.call( [ "ngramshrink", @@ -1684,7 +1684,7 @@ def create_decoding_graph(self) -> None: @classmethod def parse_parameters( cls, - config_path: Optional[str] = None, + config_path: Optional[Path] = None, args: Optional[Dict[str, typing.Any]] = None, unknown_args: Optional[typing.Iterable[str]] = None, ) -> MetaDict: @@ -1693,7 +1693,7 @@ def parse_parameters( Parameters ---------- - config_path: str, optional + config_path: :class:`~pathlib.Path`, optional Path to yaml configuration file args: dict[str, Any] Parsed arguments @@ -1734,7 +1734,7 @@ def setup(self) -> None: begin = time.time() os.makedirs(self.working_log_directory, exist_ok=True) self.load_corpus() - dirty_path = os.path.join(self.working_directory, "dirty") + dirty_path = self.working_directory.joinpath("dirty") if os.path.exists(dirty_path): shutil.rmtree(self.working_directory, ignore_errors=True) os.makedirs(self.working_log_directory, exist_ok=True) @@ -1744,13 +1744,6 @@ def setup(self) -> None: shutil.rmtree(self.model_directory) log_dir = os.path.join(self.model_directory, "log") os.makedirs(log_dir, exist_ok=True) - if self.acoustic_model.meta["version"] < "2.1": - logger.warning( - "The acoustic model was trained in an earlier version of MFA. " - "There may be incompatibilities in feature generation that cause errors. " - "Please download the latest version of the model via `mfa model download`, " - "use a different acoustic model, or use version 2.0.6 of MFA." 
- ) self.acoustic_model.validate(self) self.acoustic_model.export_model(self.model_directory) self.acoustic_model.export_model(self.working_directory) @@ -1813,7 +1806,7 @@ def export_transcriptions(self) -> None: def export_files( self, - output_directory: str, + output_directory: Path, output_format: Optional[str] = None, include_original_text: bool = False, ) -> None: diff --git a/montreal_forced_aligner/utils.py b/montreal_forced_aligner/utils.py index 7167563a..b3be365f 100644 --- a/montreal_forced_aligner/utils.py +++ b/montreal_forced_aligner/utils.py @@ -14,6 +14,7 @@ import subprocess import time import typing +from pathlib import Path from queue import Empty from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -142,7 +143,7 @@ def get_class_for_dataset_type(dataset_type: DatasetType): def parse_dictionary_file( - path: str, + path: Path, ) -> typing.Generator[ typing.Tuple[ str, @@ -158,7 +159,7 @@ def parse_dictionary_file( Parameters ---------- - path: str + path: :class:`~pathlib.Path` Path to lexicon file Yields @@ -403,7 +404,7 @@ def read_feats(proc: subprocess.Popen, raw_id=False) -> Dict[str, np.array]: yield current_id, feats -def parse_logs(log_directory: str) -> None: +def parse_logs(log_directory: Path) -> None: """ Parse the output of a Kaldi run for any errors and raise relevant MFA exceptions @@ -419,11 +420,10 @@ def parse_logs(log_directory: str) -> None: """ error_logs = [] - for name in os.listdir(log_directory): - log_path = os.path.join(log_directory, name) - if os.path.isdir(log_path): + for log_path in log_directory.iterdir(): + if log_path.is_dir(): continue - if not name.endswith(".log"): + if log_path.suffix != ".log": continue with mfa_open(log_path, "r") as f: for line in f: diff --git a/montreal_forced_aligner/vad/multiprocessing.py b/montreal_forced_aligner/vad/multiprocessing.py index e82f89a4..2a6a563d 100644 --- a/montreal_forced_aligner/vad/multiprocessing.py +++ b/montreal_forced_aligner/vad/multiprocessing.py @@ -6,6 +6,7 @@ import re import subprocess import typing +from pathlib import Path from typing import TYPE_CHECKING, List, Union from montreal_forced_aligner.abc import KaldiFunction @@ -37,7 +38,7 @@ class SegmentVadArguments(MfaArguments): """Arguments for :class:`~montreal_forced_aligner.segmenter.SegmentVadFunction`""" - vad_path: str + vad_path: Path segmentation_options: MetaDict diff --git a/montreal_forced_aligner/vad/segmenter.py b/montreal_forced_aligner/vad/segmenter.py index e208e286..b5d93d75 100644 --- a/montreal_forced_aligner/vad/segmenter.py +++ b/montreal_forced_aligner/vad/segmenter.py @@ -9,6 +9,7 @@ import os import sys import typing +from pathlib import Path from typing import Dict, List, Optional import sqlalchemy @@ -137,7 +138,7 @@ def __init__( @classmethod def parse_parameters( cls, - config_path: Optional[str] = None, + config_path: Optional[Path] = None, args: Optional[Dict[str, typing.Any]] = None, unknown_args: Optional[typing.Iterable[str]] = None, ) -> MetaDict: @@ -146,7 +147,7 @@ def parse_parameters( Parameters ---------- - config_path: str + config_path: :class:`~pathlib.Path` Config path args: dict[str, Any] Parsed arguments @@ -187,7 +188,7 @@ def segment_vad_arguments(self) -> List[SegmentVadArguments]: SegmentVadArguments( j.id, getattr(self, "db_string", ""), - os.path.join(self.working_log_directory, f"segment_vad.{j.id}.log"), + self.working_log_directory.joinpath(f"segment_vad.{j.id}.log"), j.construct_path(self.split_directory, "vad", "scp"), self.segmentation_options, ) @@ 
-330,7 +331,7 @@ def setup(self) -> None: """Setup segmentation""" super().setup() self.create_new_current_workflow(WorkflowType.segmentation) - log_dir = os.path.join(self.working_directory, "log") + log_dir = self.working_directory.joinpath("log") os.makedirs(log_dir, exist_ok=True) try: if self.speechbrain: diff --git a/montreal_forced_aligner/validation/corpus_validator.py b/montreal_forced_aligner/validation/corpus_validator.py index 636cd164..afb17002 100644 --- a/montreal_forced_aligner/validation/corpus_validator.py +++ b/montreal_forced_aligner/validation/corpus_validator.py @@ -9,6 +9,7 @@ import time import typing from decimal import Decimal +from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, Optional import sqlalchemy @@ -84,7 +85,7 @@ def __init__( @property def working_log_directory(self) -> str: """Working log directory""" - return os.path.join(self.working_directory, "log") + return self.working_directory.joinpath("log") def analyze_setup(self) -> None: """ @@ -531,7 +532,7 @@ def __init__(self, **kwargs): @classmethod def parse_parameters( cls, - config_path: Optional[str] = None, + config_path: Optional[Path] = None, args: Optional[Dict[str, Any]] = None, unknown_args: Optional[typing.Iterable[str]] = None, ) -> MetaDict: @@ -541,7 +542,7 @@ def parse_parameters( Parameters ---------- - config_path: str + config_path: :class:`~pathlib.Path` Config path args: dict[str, Any] Parsed arguments diff --git a/montreal_forced_aligner/validation/dictionary_validator.py b/montreal_forced_aligner/validation/dictionary_validator.py index 525a380a..e6b0e646 100644 --- a/montreal_forced_aligner/validation/dictionary_validator.py +++ b/montreal_forced_aligner/validation/dictionary_validator.py @@ -1,8 +1,8 @@ """Classes for validating dictionaries""" import logging -import os import shutil import typing +from pathlib import Path from montreal_forced_aligner.config import GLOBAL_CONFIG from montreal_forced_aligner.data import WorkflowType @@ -18,7 +18,7 @@ class DictionaryValidator(PyniniTrainer): Parameters ---------- - g2p_model_path: str, optional + g2p_model_path: :class:`~pathlib.Path`, optional Path to pretrained G2P model g2p_threshold: float, optional Threshold for pruning pronunciations, defaults to 1.5, which returns the optimal pronunciations and those with scores less than 1.5 times @@ -38,7 +38,7 @@ class DictionaryValidator(PyniniTrainer): def __init__( self, - g2p_model_path: typing.Optional[str] = None, + g2p_model_path: typing.Optional[Path] = None, g2p_threshold: float = 1.5, **kwargs, ): @@ -59,7 +59,7 @@ def setup(self) -> None: logger.info("Not using a pretrained G2P model, training from the dictionary...") self.initialize_training() self.train() - self.g2p_model_path = os.path.join(self.working_log_directory, "g2p_model.zip") + self.g2p_model_path = self.working_log_directory.joinpath("g2p_model.zip") self.export_model(self.g2p_model_path) self.create_new_current_workflow(WorkflowType.g2p) else: @@ -67,13 +67,13 @@ def setup(self) -> None: self.initialize_training() self.initialized = True - def validate(self, output_path: typing.Optional[str] = None) -> None: + def validate(self, output_path: typing.Optional[Path] = None) -> None: """ Validate the dictionary Parameters ---------- - output_path: str, optional + output_path: :class:`~pathlib.Path`, optional Path to save scored CSV """ self.setup() @@ -81,7 +81,7 @@ def validate(self, output_path: typing.Optional[str] = None) -> None: gen = PyniniValidator( g2p_model_path=self.g2p_model_path, 
word_list=list(self.g2p_training_dictionary.keys()), - temporary_directory=os.path.join(self.working_directory, "validation"), + temporary_directory=self.working_directory.joinpath("validation"), num_jobs=GLOBAL_CONFIG.num_jobs, num_pronunciations=self.num_pronunciations, ) diff --git a/tests/conftest.py b/tests/conftest.py index 72b4c710..2a52e0b8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,51 +20,55 @@ def mock_settings_env_vars(): @pytest.fixture(scope="session") def test_dir(): - base = os.path.dirname(os.path.abspath(__file__)) - return os.path.join(base, "data") + base = pathlib.Path(__file__).parent + return base.joinpath("data") @pytest.fixture(scope="session") def wav_dir(test_dir): - return os.path.join(test_dir, "wav") + return test_dir.joinpath("wav") @pytest.fixture(scope="session") def mp3_test_path(wav_dir): - return os.path.join(wav_dir, "dummy.mp3") + return wav_dir.joinpath("dummy.mp3") @pytest.fixture(scope="session") def opus_test_path(wav_dir): - return os.path.join(wav_dir, "13697_11991_000000.opus") + return wav_dir.joinpath("13697_11991_000000.opus") @pytest.fixture(scope="session") def lab_dir(test_dir): - return os.path.join(test_dir, "lab") + return test_dir.joinpath("lab") @pytest.fixture(scope="session") def textgrid_dir(test_dir): - return os.path.join(test_dir, "textgrid") + return test_dir.joinpath("textgrid") @pytest.fixture(scope="session") def acoustic_model_dir(test_dir): - return os.path.join(test_dir, "am") + return test_dir.joinpath("am") + + +@pytest.fixture(scope="session") +def tokenizer_model_dir(test_dir): + return test_dir.joinpath("tokenizer") @pytest.fixture(scope="session") def language_model_dir(test_dir): - return os.path.join(test_dir, "lm") + return test_dir.joinpath("lm") @pytest.fixture(scope="session") def generated_dir(test_dir): - generated = os.path.join(test_dir, "generated") + generated = test_dir.joinpath("generated") shutil.rmtree(generated, ignore_errors=True) - if not os.path.exists(generated): - os.makedirs(generated) + generated.mkdir(parents=True, exist_ok=True) return generated @@ -85,8 +89,8 @@ def global_config(): @pytest.fixture(scope="session") def temp_dir(generated_dir, global_config): - temp_dir = os.path.join(generated_dir, "temp") - global_config.current_profile.temporary_directory = pathlib.Path(temp_dir) + temp_dir = generated_dir.joinpath("temp") + global_config.current_profile.temporary_directory = temp_dir global_config.save() yield temp_dir @@ -163,7 +167,7 @@ def english_us_mfa_dictionary(model_manager): def english_us_mfa_dictionary_subset(english_us_mfa_dictionary, generated_dir): from montreal_forced_aligner.models import DictionaryModel - path = os.path.join(generated_dir, "subset_english_us.dict") + path = generated_dir.joinpath("subset_english_us.dict") if not os.path.exists(path): model = DictionaryModel(english_us_mfa_dictionary) with mfa_open(model.path, "r") as inf, mfa_open(path, "w") as outf: @@ -237,6 +241,13 @@ def english_g2p_model(model_manager): return "english_us_arpa" +@pytest.fixture(scope="session") +def japanese_tokenizer_model(model_manager): + if not model_manager.has_local_model("tokenizer", "japanese_mfa"): + model_manager.download_model("tokenizer", "japanese_mfa") + return "japanese_mfa" + + @pytest.fixture(scope="session") def english_us_mfa_g2p_model(model_manager): if not model_manager.has_local_model("g2p", "english_us_mfa"): @@ -246,62 +257,60 @@ def english_us_mfa_g2p_model(model_manager): @pytest.fixture(scope="session") def 
transcription_acoustic_model(acoustic_model_dir): - return os.path.join(acoustic_model_dir, "mono_model.zip") + return acoustic_model_dir.joinpath("mono_model.zip") + + +@pytest.fixture(scope="session") +def test_tokenizer_model(tokenizer_model_dir): + return tokenizer_model_dir.joinpath("test_tokenizer_model.zip") @pytest.fixture(scope="session") def transcription_language_model(language_model_dir, generated_dir): - return os.path.join(language_model_dir, "test_lm.zip") + return language_model_dir.joinpath("test_lm.zip") @pytest.fixture(scope="session") def transcription_language_model_arpa(language_model_dir, generated_dir): - return os.path.join(language_model_dir, "test_lm.arpa") + return language_model_dir.joinpath("test_lm.arpa") @pytest.fixture(scope="session") def corpus_root_dir(generated_dir): - return os.path.join(generated_dir, "constructed_test_corpora") + return generated_dir.joinpath("constructed_test_corpora") @pytest.fixture(scope="session") def output_model_dir(generated_dir): - return os.path.join(generated_dir, "output_models") + return generated_dir.joinpath("output_models") @pytest.fixture(scope="session") def mono_align_model_path(output_model_dir): - return os.path.join(output_model_dir, "mono_model.zip") + return output_model_dir.joinpath("mono_model.zip") @pytest.fixture() def basic_corpus_dir(corpus_root_dir, wav_dir, lab_dir): - path = os.path.join(corpus_root_dir, "test_basic") - os.makedirs(path, exist_ok=True) + path = corpus_root_dir.joinpath("test_basic") + path.mkdir(parents=True, exist_ok=True) names = [("michael", ["acoustic_corpus"]), ("sickmichael", ["cold_corpus", "cold_corpus3"])] for s, files in names: - s_dir = os.path.join(path, s) - os.makedirs(s_dir, exist_ok=True) + s_dir = path.joinpath(s) + s_dir.mkdir(exist_ok=True) for name in files: space_name = name.replace("_", " ") - shutil.copyfile( - os.path.join(wav_dir, name + ".wav"), os.path.join(s_dir, name + ".wav") - ) - shutil.copyfile( - os.path.join(wav_dir, name + ".wav"), os.path.join(s_dir, space_name + ".wav") - ) - shutil.copyfile( - os.path.join(lab_dir, name + ".lab"), os.path.join(s_dir, name + ".lab") - ) - shutil.copyfile( - os.path.join(lab_dir, name + ".lab"), os.path.join(s_dir, space_name + ".lab") - ) + shutil.copyfile(wav_dir.joinpath(name + ".wav"), s_dir.joinpath(name + ".wav")) + shutil.copyfile(wav_dir.joinpath(name + ".wav"), s_dir.joinpath(space_name + ".wav")) + shutil.copyfile(lab_dir.joinpath(name + ".lab"), s_dir.joinpath(name + ".lab")) + shutil.copyfile(lab_dir.joinpath(name + ".lab"), s_dir.joinpath(space_name + ".lab")) return path @pytest.fixture() def combined_corpus_dir(corpus_root_dir, wav_dir, lab_dir): - path = os.path.join(corpus_root_dir, "test_combined") + path = corpus_root_dir.joinpath("test_combined") + path.mkdir(parents=True, exist_ok=True) os.makedirs(path, exist_ok=True) names = [ ("michael", ["acoustic_corpus.wav"]), @@ -336,71 +345,65 @@ def combined_corpus_dir(corpus_root_dir, wav_dir, lab_dir): ), ] for s, files in names: - s_dir = os.path.join(path, s) + s_dir = path.joinpath(s) os.makedirs(s_dir, exist_ok=True) for name in files: - shutil.copyfile(os.path.join(wav_dir, name), os.path.join(s_dir, name)) + shutil.copyfile(wav_dir.joinpath(name), s_dir.joinpath(name)) text_name = name.split(".")[0] + ".lab" - if not os.path.exists(os.path.join(lab_dir, text_name)): + if not lab_dir.joinpath(text_name).exists(): text_name = name.split(".")[0] + ".txt" - shutil.copyfile(os.path.join(lab_dir, text_name), os.path.join(s_dir, text_name)) + 
+            shutil.copyfile(lab_dir.joinpath(text_name), s_dir.joinpath(text_name))
     return path


 @pytest.fixture()
 def duplicated_name_corpus_dir(corpus_root_dir, wav_dir, lab_dir):
-    path = os.path.join(corpus_root_dir, "test_duplicated")
+    path = corpus_root_dir.joinpath("test_duplicated")
     os.makedirs(path, exist_ok=True)
     names = [("michael", ["acoustic_corpus"]), ("sickmichael", ["cold_corpus", "cold_corpus3"])]
     for s, files in names:
-        s_dir = os.path.join(path, s)
+        s_dir = path.joinpath(s)
         os.makedirs(s_dir, exist_ok=True)
         for i, name in enumerate(files):
             new_name = f"recording_{i}"
-            shutil.copyfile(
-                os.path.join(wav_dir, name + ".wav"), os.path.join(s_dir, new_name + ".wav")
-            )
-            shutil.copyfile(
-                os.path.join(lab_dir, name + ".lab"), os.path.join(s_dir, new_name + ".lab")
-            )
+            shutil.copyfile(wav_dir.joinpath(name + ".wav"), s_dir.joinpath(new_name + ".wav"))
+            shutil.copyfile(lab_dir.joinpath(name + ".lab"), s_dir.joinpath(new_name + ".lab"))
     return path


 @pytest.fixture(scope="session")
 def basic_reference_dir(corpus_root_dir, wav_dir, textgrid_dir):
-    path = os.path.join(corpus_root_dir, "test_basic_reference")
+    path = corpus_root_dir.joinpath("test_basic_reference")
     os.makedirs(path, exist_ok=True)
     names = [("michael", ["acoustic_corpus"]), ("sickmichael", ["cold_corpus", "cold_corpus3"])]
     for s, files in names:
-        s_dir = os.path.join(path, s)
+        s_dir = path.joinpath(s)
         os.makedirs(s_dir, exist_ok=True)
         for name in files:
             shutil.copyfile(
-                os.path.join(textgrid_dir, name + ".TextGrid"),
-                os.path.join(s_dir, name + ".TextGrid"),
+                textgrid_dir.joinpath(name + ".TextGrid"),
+                s_dir.joinpath(name + ".TextGrid"),
             )
     return path


 @pytest.fixture()
 def xsampa_corpus_dir(corpus_root_dir, wav_dir, lab_dir):
-    path = os.path.join(corpus_root_dir, "test_xsampa")
+    path = corpus_root_dir.joinpath("test_xsampa")
     os.makedirs(path, exist_ok=True)
-    s_dir = os.path.join(path, "michael")
+    s_dir = path.joinpath("michael")
     os.makedirs(s_dir, exist_ok=True)
-    shutil.copyfile(
-        os.path.join(wav_dir, "acoustic_corpus.wav"), os.path.join(s_dir, "xsampa.wav")
-    )
-    shutil.copyfile(os.path.join(lab_dir, "xsampa.lab"), os.path.join(s_dir, "xsampa.lab"))
+    shutil.copyfile(wav_dir.joinpath("acoustic_corpus.wav"), s_dir.joinpath("xsampa.wav"))
+    shutil.copyfile(lab_dir.joinpath("xsampa.lab"), s_dir.joinpath("xsampa.lab"))
     return path


 @pytest.fixture()
 def basic_split_dir(corpus_root_dir, wav_dir, lab_dir, textgrid_dir):
-    path = os.path.join(corpus_root_dir, "test_split")
-    audio_path = os.path.join(path, "audio")
-    text_path = os.path.join(path, "text")
+    path = corpus_root_dir.joinpath("test_split")
+    audio_path = path.joinpath("audio")
+    text_path = path.joinpath("text")
     os.makedirs(path, exist_ok=True)
     names = [
         ("michael", ["acoustic_corpus"]),
@@ -427,24 +430,24 @@ def basic_split_dir(corpus_root_dir, wav_dir, lab_dir, textgrid_dir):
         ),
     ]
     for s, files in names:
-        s_text_dir = os.path.join(text_path, s)
-        s_audio_dir = os.path.join(audio_path, s)
+        s_text_dir = text_path.joinpath(s)
+        s_audio_dir = audio_path.joinpath(s)
         os.makedirs(s_text_dir, exist_ok=True)
         os.makedirs(s_audio_dir, exist_ok=True)
         for name in files:
-            wav_path = os.path.join(wav_dir, name + ".wav")
+            wav_path = wav_dir.joinpath(name + ".wav")
             if os.path.exists(wav_path):
-                shutil.copyfile(wav_path, wav_path.replace(wav_dir, s_audio_dir))
+                shutil.copyfile(wav_path, s_audio_dir.joinpath(name + ".wav"))
-            lab_path = os.path.join(lab_dir, name + ".lab")
+            lab_path = lab_dir.joinpath(name + ".lab")
             if not os.path.exists(lab_path):
-                lab_path = os.path.join(lab_dir, name + ".txt")
+                lab_path = lab_dir.joinpath(name + ".txt")
-            shutil.copyfile(lab_path, lab_path.replace(lab_dir, s_text_dir))
+            shutil.copyfile(lab_path, s_text_dir.joinpath(lab_path.name))
     return audio_path, text_path


 @pytest.fixture()
 def multilingual_ipa_corpus_dir(corpus_root_dir, wav_dir, lab_dir):
-    path = os.path.join(corpus_root_dir, "test_multilingual")
+    path = corpus_root_dir.joinpath("test_multilingual")
     os.makedirs(path, exist_ok=True)
     names = [
         (
@@ -469,21 +472,17 @@ def multilingual_ipa_corpus_dir(corpus_root_dir, wav_dir, lab_dir):
         ),
     ]
     for s, files in names:
-        s_dir = os.path.join(path, s)
+        s_dir = path.joinpath(s)
         os.makedirs(s_dir, exist_ok=True)
         for name in files:
-            shutil.copyfile(
-                os.path.join(wav_dir, name + ".flac"), os.path.join(s_dir, name + ".flac")
-            )
-            shutil.copyfile(
-                os.path.join(lab_dir, name + ".txt"), os.path.join(s_dir, name + ".txt")
-            )
+            shutil.copyfile(wav_dir.joinpath(name + ".flac"), s_dir.joinpath(name + ".flac"))
+            shutil.copyfile(lab_dir.joinpath(name + ".txt"), s_dir.joinpath(name + ".txt"))
     return path


 @pytest.fixture()
 def multilingual_ipa_tg_corpus_dir(corpus_root_dir, wav_dir, textgrid_dir):
-    path = os.path.join(corpus_root_dir, "test_multilingual_tg")
+    path = corpus_root_dir.joinpath("test_multilingual_tg")
     os.makedirs(path, exist_ok=True)
     names = [
         (
@@ -508,78 +507,70 @@ def multilingual_ipa_tg_corpus_dir(corpus_root_dir, wav_dir, textgrid_dir):
         ),
     ]
     for s, files in names:
-        s_dir = os.path.join(path, s)
+        s_dir = path.joinpath(s)
         os.makedirs(s_dir, exist_ok=True)
         for name in files:
+            shutil.copyfile(wav_dir.joinpath(name + ".flac"), s_dir.joinpath(name + ".flac"))
             shutil.copyfile(
-                os.path.join(wav_dir, name + ".flac"), os.path.join(s_dir, name + ".flac")
-            )
-            shutil.copyfile(
-                os.path.join(textgrid_dir, name + ".TextGrid"),
-                os.path.join(s_dir, name + ".TextGrid"),
+                textgrid_dir.joinpath(name + ".TextGrid"),
+                s_dir.joinpath(name + ".TextGrid"),
             )
     return path


 @pytest.fixture()
 def weird_words_dir(corpus_root_dir, wav_dir, lab_dir):
-    path = os.path.join(corpus_root_dir, "test_weird_words")
+    path = corpus_root_dir.joinpath("test_weird_words")
     os.makedirs(path, exist_ok=True)
     name = "weird_words"
-    shutil.copyfile(
-        os.path.join(wav_dir, "acoustic_corpus.wav"), os.path.join(path, name + ".wav")
-    )
-    shutil.copyfile(os.path.join(lab_dir, name + ".lab"), os.path.join(path, name + ".lab"))
+    shutil.copyfile(wav_dir.joinpath("acoustic_corpus.wav"), path.joinpath(name + ".wav"))
+    shutil.copyfile(lab_dir.joinpath(name + ".lab"), path.joinpath(name + ".lab"))
     return path


 @pytest.fixture()
 def punctuated_dir(corpus_root_dir, wav_dir, lab_dir):
-    path = os.path.join(corpus_root_dir, "test_punctuated")
+    path = corpus_root_dir.joinpath("test_punctuated")
     os.makedirs(path, exist_ok=True)
     name = "punctuated"
-    shutil.copyfile(
-        os.path.join(wav_dir, "acoustic_corpus.wav"), os.path.join(path, name + ".wav")
-    )
-    shutil.copyfile(os.path.join(lab_dir, name + ".lab"), os.path.join(path, name + ".lab"))
+    shutil.copyfile(wav_dir.joinpath("acoustic_corpus.wav"), path.joinpath(name + ".wav"))
+    shutil.copyfile(lab_dir.joinpath(name + ".lab"), path.joinpath(name + ".lab"))
     name = "weird_words"
-    shutil.copyfile(
-        os.path.join(wav_dir, "acoustic_corpus.wav"), os.path.join(path, name + ".wav")
-    )
-    shutil.copyfile(os.path.join(lab_dir, name + ".lab"), os.path.join(path, name + ".lab"))
+    shutil.copyfile(wav_dir.joinpath("acoustic_corpus.wav"), path.joinpath(name + ".wav"))
".lab"), path.joinpath(name + ".lab")) return path @pytest.fixture() def japanese_dir(corpus_root_dir, wav_dir, lab_dir): - path = os.path.join(corpus_root_dir, "test_japanese") + path = corpus_root_dir.joinpath("test_japanese") os.makedirs(path, exist_ok=True) name = "japanese" - shutil.copyfile(os.path.join(lab_dir, name + ".lab"), os.path.join(path, name + ".lab")) + shutil.copyfile(lab_dir.joinpath(name + ".lab"), path.joinpath(name + ".lab")) return path @pytest.fixture() def devanagari_dir(corpus_root_dir, wav_dir, lab_dir): - path = os.path.join(corpus_root_dir, "test_devanagari") + path = corpus_root_dir.joinpath("test_devanagari") os.makedirs(path, exist_ok=True) name = "devanagari" - shutil.copyfile(os.path.join(lab_dir, name + ".lab"), os.path.join(path, name + ".lab")) + shutil.copyfile(lab_dir.joinpath(name + ".lab"), path.joinpath(name + ".lab")) return path @pytest.fixture() def french_clitics_dir(corpus_root_dir, wav_dir, lab_dir): - path = os.path.join(corpus_root_dir, "test_french_clitics") + path = corpus_root_dir.joinpath("test_french_clitics") os.makedirs(path, exist_ok=True) name = "french_clitics" - shutil.copyfile(os.path.join(lab_dir, name + ".lab"), os.path.join(path, name + ".lab")) + shutil.copyfile(lab_dir.joinpath(name + ".lab"), path.joinpath(name + ".lab")) return path @pytest.fixture() def swedish_dir(corpus_root_dir, wav_dir, lab_dir): - path = os.path.join(corpus_root_dir, "test_swedish") + path = corpus_root_dir.joinpath("test_swedish") os.makedirs(path, exist_ok=True) names = [ ( @@ -593,384 +584,379 @@ def swedish_dir(corpus_root_dir, wav_dir, lab_dir): ) ] for s, files in names: - s_dir = os.path.join(path, s) + s_dir = path.joinpath(s) os.makedirs(s_dir, exist_ok=True) for name in files: - shutil.copyfile( - os.path.join(wav_dir, name + ".wav"), os.path.join(s_dir, name + ".wav") - ) - shutil.copyfile( - os.path.join(lab_dir, name + ".lab"), os.path.join(s_dir, name + ".txt") - ) + shutil.copyfile(wav_dir.joinpath(name + ".wav"), s_dir.joinpath(name + ".wav")) + shutil.copyfile(lab_dir.joinpath(name + ".lab"), s_dir.joinpath(name + ".txt")) return path @pytest.fixture() def basic_corpus_txt_dir(corpus_root_dir, wav_dir, lab_dir): - path = os.path.join(corpus_root_dir, "test_basic_txt") + path = corpus_root_dir.joinpath("test_basic_txt") os.makedirs(path, exist_ok=True) names = [("michael", ["acoustic_corpus"]), ("sickmichael", ["cold_corpus", "cold_corpus3"])] for s, files in names: - s_dir = os.path.join(path, s) + s_dir = path.joinpath(s) os.makedirs(s_dir, exist_ok=True) for name in files: - shutil.copyfile( - os.path.join(wav_dir, name + ".wav"), os.path.join(s_dir, name + ".wav") - ) - shutil.copyfile( - os.path.join(lab_dir, name + ".lab"), os.path.join(s_dir, name + ".txt") - ) + shutil.copyfile(wav_dir.joinpath(name + ".wav"), s_dir.joinpath(name + ".wav")) + shutil.copyfile(lab_dir.joinpath(name + ".lab"), s_dir.joinpath(name + ".txt")) return path @pytest.fixture() def extra_corpus_dir(corpus_root_dir, wav_dir, lab_dir): - path = os.path.join(corpus_root_dir, "test_extra") + path = corpus_root_dir.joinpath("test_extra") os.makedirs(path, exist_ok=True) name = "cold_corpus3" - shutil.copyfile(os.path.join(wav_dir, name + ".wav"), os.path.join(path, name + ".wav")) - shutil.copyfile(os.path.join(lab_dir, name + "_extra.lab"), os.path.join(path, name + ".lab")) + shutil.copyfile(wav_dir.joinpath(name + ".wav"), path.joinpath(name + ".wav")) + shutil.copyfile(lab_dir.joinpath(name + "_extra.lab"), path.joinpath(name + ".lab")) return path 


 @pytest.fixture()
 def transcribe_corpus_24bit_dir(corpus_root_dir, wav_dir):
-    path = os.path.join(corpus_root_dir, "test_24bit")
+    path = corpus_root_dir.joinpath("test_24bit")
     os.makedirs(path, exist_ok=True)
     name = "cold_corpus_24bit"
-    shutil.copyfile(os.path.join(wav_dir, name + ".wav"), os.path.join(path, name + ".wav"))
+    shutil.copyfile(wav_dir.joinpath(name + ".wav"), path.joinpath(name + ".wav"))
     name = "cold_corpus_32bit_float"
-    shutil.copyfile(os.path.join(wav_dir, name + ".wav"), os.path.join(path, name + ".wav"))
+    shutil.copyfile(wav_dir.joinpath(name + ".wav"), path.joinpath(name + ".wav"))
     return path


 @pytest.fixture()
 def stereo_corpus_dir(corpus_root_dir, wav_dir, textgrid_dir):
-    path = os.path.join(corpus_root_dir, "test_stereo")
+    path = corpus_root_dir.joinpath("test_stereo")
     os.makedirs(path, exist_ok=True)
     name = "michaelandsickmichael"
-    shutil.copyfile(os.path.join(wav_dir, name + ".wav"), os.path.join(path, name + ".wav"))
-    shutil.copyfile(
-        os.path.join(textgrid_dir, name + ".TextGrid"), os.path.join(path, name + ".TextGrid")
-    )
+    shutil.copyfile(wav_dir.joinpath(name + ".wav"), path.joinpath(name + ".wav"))
+    shutil.copyfile(textgrid_dir.joinpath(name + ".TextGrid"), path.joinpath(name + ".TextGrid"))
     return path


 @pytest.fixture()
 def mp3_corpus_dir(corpus_root_dir, wav_dir, lab_dir):
-    path = os.path.join(corpus_root_dir, "test_cv_mp3")
+    path = corpus_root_dir.joinpath("test_cv_mp3")
     os.makedirs(path, exist_ok=True)
     names = ["common_voice_en_22058264", "common_voice_en_22058266", "common_voice_en_22058267"]
     for name in names:
-        shutil.copyfile(os.path.join(wav_dir, name + ".mp3"), os.path.join(path, name + ".mp3"))
-        shutil.copyfile(os.path.join(lab_dir, name + ".lab"), os.path.join(path, name + ".lab"))
+        shutil.copyfile(wav_dir.joinpath(name + ".mp3"), path.joinpath(name + ".mp3"))
+        shutil.copyfile(lab_dir.joinpath(name + ".lab"), path.joinpath(name + ".lab"))
     return path


 @pytest.fixture()
 def opus_corpus_dir(corpus_root_dir, wav_dir, lab_dir):
-    path = os.path.join(corpus_root_dir, "test_mls_opus")
+    path = corpus_root_dir.joinpath("test_mls_opus")
     os.makedirs(path, exist_ok=True)
     names = ["13697_11991_000000"]
     for name in names:
-        shutil.copyfile(os.path.join(wav_dir, name + ".opus"), os.path.join(path, name + ".opus"))
-        shutil.copyfile(os.path.join(lab_dir, name + ".lab"), os.path.join(path, name + ".lab"))
+        shutil.copyfile(wav_dir.joinpath(name + ".opus"), path.joinpath(name + ".opus"))
+        shutil.copyfile(lab_dir.joinpath(name + ".lab"), path.joinpath(name + ".lab"))
     return path


 @pytest.fixture()
 def stereo_corpus_short_tg_dir(corpus_root_dir, wav_dir, textgrid_dir):
-    path = os.path.join(corpus_root_dir, "test_stereo_short_tg")
+    path = corpus_root_dir.joinpath("test_stereo_short_tg")
     os.makedirs(path, exist_ok=True)
     name = "michaelandsickmichael"
-    shutil.copyfile(os.path.join(wav_dir, name + ".wav"), os.path.join(path, name + ".wav"))
+    shutil.copyfile(wav_dir.joinpath(name + ".wav"), path.joinpath(name + ".wav"))
     shutil.copyfile(
-        os.path.join(textgrid_dir, name + "_short_tg.TextGrid"),
-        os.path.join(path, name + ".TextGrid"),
+        textgrid_dir.joinpath(name + "_short_tg.TextGrid"),
+        path.joinpath(name + ".TextGrid"),
     )
     return path


 @pytest.fixture()
 def flac_corpus_dir(corpus_root_dir, wav_dir, lab_dir):
-    path = os.path.join(corpus_root_dir, "test_flac_corpus")
+    path = corpus_root_dir.joinpath("test_flac_corpus")
     os.makedirs(path, exist_ok=True)
     name = "61-70968-0000"
-    shutil.copyfile(os.path.join(wav_dir, name + ".flac"), os.path.join(path, name + ".flac"))
-    shutil.copyfile(os.path.join(lab_dir, name + ".lab"), os.path.join(path, name + ".lab"))
+    shutil.copyfile(wav_dir.joinpath(name + ".flac"), path.joinpath(name + ".flac"))
+    shutil.copyfile(lab_dir.joinpath(name + ".lab"), path.joinpath(name + ".lab"))
     return path


 @pytest.fixture()
 def flac_tg_corpus_dir(corpus_root_dir, wav_dir, textgrid_dir):
-    path = os.path.join(corpus_root_dir, "test_flac_tg_corpus")
+    path = corpus_root_dir.joinpath("test_flac_tg_corpus")
     os.makedirs(path, exist_ok=True)
     name = "61-70968-0000"
-    shutil.copyfile(os.path.join(wav_dir, name + ".flac"), os.path.join(path, name + ".flac"))
-    shutil.copyfile(
-        os.path.join(textgrid_dir, name + ".TextGrid"), os.path.join(path, name + ".TextGrid")
-    )
+    shutil.copyfile(wav_dir.joinpath(name + ".flac"), path.joinpath(name + ".flac"))
+    shutil.copyfile(textgrid_dir.joinpath(name + ".TextGrid"), path.joinpath(name + ".TextGrid"))
     return path


 @pytest.fixture()
 def shortsegments_corpus_dir(corpus_root_dir, wav_dir, textgrid_dir):
-    path = os.path.join(corpus_root_dir, "test_short_segments")
+    path = corpus_root_dir.joinpath("test_short_segments")
     os.makedirs(path, exist_ok=True)
     name = "short_segments"
-    shutil.copyfile(os.path.join(wav_dir, "dummy.wav"), os.path.join(path, name + ".wav"))
-    shutil.copyfile(
-        os.path.join(textgrid_dir, name + ".TextGrid"), os.path.join(path, name + ".TextGrid")
-    )
+    shutil.copyfile(wav_dir.joinpath("dummy.wav"), path.joinpath(name + ".wav"))
+    shutil.copyfile(textgrid_dir.joinpath(name + ".TextGrid"), path.joinpath(name + ".TextGrid"))
     return path


 @pytest.fixture(scope="session")
 def dict_dir(test_dir):
-    return os.path.join(test_dir, "dictionaries")
+    return test_dir.joinpath("dictionaries")


 @pytest.fixture(scope="session")
 def abstract_dict_path(dict_dir):
-    return os.path.join(dict_dir, "test_abstract.txt")
+    return dict_dir.joinpath("test_abstract.txt")


 @pytest.fixture(scope="session")
 def basic_dict_path(dict_dir):
-    return os.path.join(dict_dir, "test_basic.txt")
+    return dict_dir.joinpath("test_basic.txt")


 @pytest.fixture(scope="session")
 def tabbed_dict_path(dict_dir):
-    return os.path.join(dict_dir, "test_tabbed_dictionary.txt")
+    return dict_dir.joinpath("test_tabbed_dictionary.txt")


 @pytest.fixture(scope="session")
 def extra_annotations_path(dict_dir):
-    return os.path.join(dict_dir, "test_extra_annotations.txt")
+    return dict_dir.joinpath("test_extra_annotations.txt")


 @pytest.fixture(scope="session")
 def frclitics_dict_path(dict_dir):
-    return os.path.join(dict_dir, "test_frclitics.txt")
+    return dict_dir.joinpath("test_frclitics.txt")


 @pytest.fixture(scope="session")
 def japanese_dict_path(dict_dir):
-    return os.path.join(dict_dir, "test_japanese.txt")
+    return dict_dir.joinpath("test_japanese.txt")


 @pytest.fixture(scope="session")
 def hindi_dict_path(dict_dir):
-    return os.path.join(dict_dir, "test_hindi.txt")
+    return dict_dir.joinpath("test_hindi.txt")


 @pytest.fixture(scope="session")
 def xsampa_dict_path(dict_dir):
-    return os.path.join(dict_dir, "test_xsampa.txt")
+    return dict_dir.joinpath("test_xsampa.txt")


 @pytest.fixture(scope="session")
 def mixed_dict_path(dict_dir):
-    return os.path.join(dict_dir, "test_mixed_format_dictionary.txt")
+    return dict_dir.joinpath("test_mixed_format_dictionary.txt")


 @pytest.fixture(scope="session")
 def vietnamese_dict_path(dict_dir):
-    return os.path.join(dict_dir, "test_vietnamese_ipa.txt")
+    return dict_dir.joinpath("test_vietnamese_ipa.txt")


 @pytest.fixture(scope="session")
 def acoustic_dict_path(dict_dir):
-    return os.path.join(dict_dir, "test_acoustic.txt")
+    return dict_dir.joinpath("test_acoustic.txt")


 @pytest.fixture(scope="session")
 def rules_path(config_directory):
-    return os.path.join(config_directory, "test_rules.yaml")
+    return config_directory.joinpath("test_rules.yaml")


 @pytest.fixture(scope="session")
 def groups_path(config_directory):
-    return os.path.join(config_directory, "test_groups.yaml")
+    return config_directory.joinpath("test_groups.yaml")


 @pytest.fixture(scope="session")
 def speaker_dictionary_path(basic_dict_path, acoustic_dict_path, generated_dir):
     data = {"default": acoustic_dict_path, "sickmichael": basic_dict_path}
-    speaker_dict_path = os.path.join(generated_dir, "test_basic_acoustic_dicts.yaml")
+    speaker_dict_path = generated_dir.joinpath("test_basic_acoustic_dicts.yaml")
     with mfa_open(speaker_dict_path, "w") as f:
-        yaml.safe_dump(data, f, allow_unicode=True)
+        yaml.dump(data, f, Dumper=yaml.Dumper, allow_unicode=True)
     return speaker_dict_path


 @pytest.fixture(scope="session")
 def mono_output_directory(generated_dir):
-    return os.path.join(generated_dir, "mono_output")
+    return generated_dir.joinpath("mono_output")


 @pytest.fixture(scope="session")
 def textgrid_output_model_path(generated_dir):
-    return os.path.join(generated_dir, "textgrid_output_model.zip")
+    return generated_dir.joinpath("textgrid_output_model.zip")


 @pytest.fixture(scope="session")
 def acoustic_g2p_model_path(generated_dir):
-    return os.path.join(generated_dir, "acoustic_g2p_output_model.zip")
+    return generated_dir.joinpath("acoustic_g2p_output_model.zip")


 @pytest.fixture(scope="session")
 def ivector_output_model_path(generated_dir):
-    return os.path.join(generated_dir, "ivector_output_model.zip")
+    return generated_dir.joinpath("ivector_output_model.zip")


 @pytest.fixture(scope="session")
 def basic_g2p_model_path(generated_dir):
-    return os.path.join(generated_dir, "basic_g2p.zip")
+    return generated_dir.joinpath("basic_g2p.zip")
+
+
+@pytest.fixture(scope="session")
+def basic_tokenizer_model_path(generated_dir):
+    return generated_dir.joinpath("basic_tokenizer.zip")


 @pytest.fixture(scope="session")
 def basic_phonetisaurus_g2p_model_path(generated_dir):
-    return os.path.join(generated_dir, "basic_phonetisaurus_g2p.zip")
+    return generated_dir.joinpath("basic_phonetisaurus_g2p.zip")


 @pytest.fixture(scope="session")
 def g2p_basic_output(generated_dir):
-    return os.path.join(generated_dir, "g2p_basic.txt")
+    return generated_dir.joinpath("g2p_basic.txt")


 @pytest.fixture(scope="session")
 def g2p_basic_phonetisaurus_output(generated_dir):
-    return os.path.join(generated_dir, "phonetisaurus_g2p_basic.txt")
+    return generated_dir.joinpath("phonetisaurus_g2p_basic.txt")


 @pytest.fixture(scope="session")
 def orth_basic_output(generated_dir):
-    return os.path.join(generated_dir, "orth_basic.txt")
+    return generated_dir.joinpath("orth_basic.txt")


 @pytest.fixture(scope="session")
 def config_directory(test_dir):
-    return os.path.join(test_dir, "configs")
+    return test_dir.joinpath("configs")


 @pytest.fixture(scope="session")
 def eval_mapping_path(config_directory):
-    return os.path.join(config_directory, "eval_mapping.yaml")
+    return config_directory.joinpath("eval_mapping.yaml")


 @pytest.fixture(scope="session")
 def basic_train_config_path(config_directory):
-    return os.path.join(config_directory, "basic_train_config.yaml")
+    return config_directory.joinpath("basic_train_config.yaml")


 @pytest.fixture(scope="session")
 def train_g2p_acoustic_config_path(config_directory):
-    return os.path.join(config_directory, "train_g2p_acoustic.yaml")
+    return config_directory.joinpath("train_g2p_acoustic.yaml")


 @pytest.fixture(scope="session")
 def transcribe_config_path(config_directory):
-    return os.path.join(config_directory, "transcribe.yaml")
+    return config_directory.joinpath("transcribe.yaml")


 @pytest.fixture(scope="session")
 def g2p_config_path(config_directory):
-    return os.path.join(config_directory, "g2p_config.yaml")
+    return config_directory.joinpath("g2p_config.yaml")


 @pytest.fixture(scope="session")
 def train_g2p_config_path(config_directory):
-    return os.path.join(config_directory, "train_g2p_config.yaml")
+    return config_directory.joinpath("train_g2p_config.yaml")


 @pytest.fixture(scope="session")
 def basic_train_lm_config_path(config_directory):
-    return os.path.join(config_directory, "basic_train_lm.yaml")
+    return config_directory.joinpath("basic_train_lm.yaml")


 @pytest.fixture(scope="session")
 def different_punctuation_config_path(config_directory):
-    return os.path.join(config_directory, "different_punctuation_config.yaml")
+    return config_directory.joinpath("different_punctuation_config.yaml")


 @pytest.fixture(scope="session")
 def no_punctuation_config_path(config_directory):
-    return os.path.join(config_directory, "no_punctuation_config.yaml")
+    return config_directory.joinpath("no_punctuation_config.yaml")


 @pytest.fixture(scope="session")
 def basic_align_config_path(config_directory):
-    return os.path.join(config_directory, "basic_align_config.yaml")
+    return config_directory.joinpath("basic_align_config.yaml")


 @pytest.fixture(scope="session")
 def basic_segment_config_path(config_directory):
-    return os.path.join(config_directory, "basic_segment_config.yaml")
+    return config_directory.joinpath("basic_segment_config.yaml")


 @pytest.fixture(scope="session")
 def train_ivector_config_path(config_directory):
-    return os.path.join(config_directory, "ivector_train.yaml")
+    return config_directory.joinpath("ivector_train.yaml")


 @pytest.fixture(scope="session")
 def mono_align_config_path(config_directory):
-    return os.path.join(config_directory, "mono_align.yaml")
+    return config_directory.joinpath("mono_align.yaml")


 @pytest.fixture(scope="session")
 def pron_train_config_path(config_directory):
-    return os.path.join(config_directory, "pron_train.yaml")
+    return config_directory.joinpath("pron_train.yaml")


 @pytest.fixture(scope="session")
 def mono_train_config_path(config_directory):
-    return os.path.join(config_directory, "mono_train.yaml")
+    return config_directory.joinpath("mono_train.yaml")


 @pytest.fixture(scope="session")
 def xsampa_train_config_path(config_directory):
-    return os.path.join(config_directory, "xsampa_train.yaml")
+    return config_directory.joinpath("xsampa_train.yaml")


 @pytest.fixture(scope="session")
 def tri_train_config_path(config_directory):
-    return os.path.join(config_directory, "tri_train.yaml")
+    return config_directory.joinpath("tri_train.yaml")


 @pytest.fixture(scope="session")
 def pitch_train_config_path(config_directory):
-    return os.path.join(config_directory, "pitch_tri_train.yaml")
+    return config_directory.joinpath("pitch_tri_train.yaml")


 @pytest.fixture(scope="session")
 def lda_train_config_path(config_directory):
-    return os.path.join(config_directory, "lda_train.yaml")
+    return config_directory.joinpath("lda_train.yaml")


 @pytest.fixture(scope="session")
 def sat_train_config_path(config_directory):
-    return os.path.join(config_directory, "sat_train.yaml")
+    return config_directory.joinpath("sat_train.yaml")
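[Editor's note, not part of the patch: every fixture hunk above follows the same os.path-to-pathlib conversion. A minimal standalone sketch of the equivalence may help when reviewing; the paths below are hypothetical, not taken from the test suite.]

    import os
    import pathlib

    # Old style: str in, str out; new style: Path in, Path out.
    legacy = os.path.join("tests", "data")
    modern = pathlib.Path("tests").joinpath("data")
    assert str(modern) == legacy

    # os functions accept Path objects (PEP 519), so call sites that still
    # use os.makedirs / os.path.exists keep working after the conversion.
    modern.mkdir(parents=True, exist_ok=True)
    assert os.path.exists(modern)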


 @pytest.fixture(scope="session")
 def multispeaker_dictionary_config_path(generated_dir, basic_dict_path, english_dictionary):
-    path = os.path.join(generated_dir, "multispeaker_dictionary.yaml")
+    path = generated_dir.joinpath("multispeaker_dictionary.yaml")
     with mfa_open(path, "w") as f:
-        yaml.safe_dump(
-            {"default": english_dictionary, "michael": basic_dict_path}, f, allow_unicode=True
+        yaml.dump(
+            {"default": english_dictionary, "michael": basic_dict_path},
+            f,
+            Dumper=yaml.Dumper,
+            allow_unicode=True,
         )
     return path


 @pytest.fixture(scope="session")
 def mfa_speaker_dict_path(generated_dir, english_uk_mfa_dictionary, english_us_mfa_dictionary):
-    path = os.path.join(generated_dir, "test_multispeaker_mfa_dictionary.yaml")
+    path = generated_dir.joinpath("test_multispeaker_mfa_dictionary.yaml")
     with mfa_open(path, "w") as f:
-        yaml.safe_dump(
+        yaml.dump(
             {"default": english_us_mfa_dictionary, "speaker": english_uk_mfa_dictionary},
             f,
+            Dumper=yaml.Dumper,
             allow_unicode=True,
         )
     return path
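[Editor's note, not part of the patch: the switch from `yaml.safe_dump` to `yaml.dump(..., Dumper=yaml.Dumper)` in the fixtures above is presumably needed because the dictionary-path fixtures now yield `pathlib.Path` objects, which `SafeDumper` cannot represent. A small sketch of the difference, assuming stock PyYAML behavior:]

    import pathlib
    import yaml

    data = {"default": pathlib.Path("dictionaries") / "test_basic.txt"}

    try:
        yaml.safe_dump(data, allow_unicode=True)
    except yaml.representer.RepresenterError as err:
        print("safe_dump rejects Path values:", err)

    # The full Dumper falls back to represent_object() for arbitrary Python
    # objects, so Path values serialize (as !!python/object/apply tags).
    print(yaml.dump(data, Dumper=yaml.Dumper, allow_unicode=True))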
diff --git a/tests/data/tokenizer/test_tokenizer_model.zip b/tests/data/tokenizer/test_tokenizer_model.zip
new file mode 100644
index 0000000000000000000000000000000000000000..b73dbece1284ad61246afd12d5da795b5f283aca
GIT binary patch
literal 72507
[72507-byte binary payload omitted]
zbWzV26*voB3lM0$oYcJi-9El-e@Lab(GP^;ws^7phCb6vr2cQ07^8uXu0lzkx#Q&r zf@|+xnNyxMC}ZhNvzKgftA61go?51M6|7k^uR~f9p#u3R&j682XgLo_10E<{C&k2T zmFOPNK=mJ(H!fjxKt6|F!~vkAn_VQ|9$a|8FQMl8B|dx-woAN*-W}_U5g9&qd&O+G zvr?U`pS~CMknhagH_tFS>n^E`_U5-_b1>)oEP7AW=ekDWcTrY2p6w(?q@Oh*t@iTU z+KmTAV9cUdA7yoBL=gGn-Pcfb31*Qmk!+n&qG@gY7*LVo4Nbel3nH;vT(U=HMx0mqqK{yZ**mH5IE&Bw3|p5FcLe zDGEr#Z`Gs6708~X?ya=_m@^*!`FpRiNzQR9;`NhSpWf&&)t=h{X?E?65Ao%4>FJ>R z7-Yu<(VXLUe_Flg7rHg}K+C=n+3`oLQ_@E^1B5dBu^!t^yM79z)_&NT>rK)ev_@m_ z6ykR=M`BTMtQ7Hc*R0_M7Yu{6!EXtGo*j|uL(p1%?T5lh5PSEG6rzr;*b?|QUVjU& z@~LtVoEIzoV6mr=t~aNvj_2{ii|WNCZGpwFcWMF+^{ZdZ$i%pryD+8bZx~)OJ$w+% z9tF*9x%cyJm7(h+C}XRcNb#U`gZ}$IP*w4;P!c){uNG&aJS`h8!LzMtWS1#YvA=|_ zmRGG_jri$_VN<&(jhTgY54;I$vRQ2#W6M;S3YT)VLtVwexoc!r&xo&8ZxW@*caB6O#_-(NAN*hvR zW9qsg?A5>w_n#h=q)o2-eHpfv_dF4lno&PCiS2`2d*=zc`iq}ou6x!gcfBnegg;G6 z`#iLc=R-cy&coGDd9A(IO+94Vd^q*F{<*G!0MUJ!MqUti6>ek1-M3;Rvu0xUO#N@G zi*ZZEvt^}mFl%2ieXn@%@=s!nIdI^)4o``pEldSvS5+5^s|}lCE{N1;fMCk|cn|>z zTN*nlR0nhg6QI{ML$ji1sw4Xn5crjYp|c9abmy`jC^e+py48JJ@cfk~grUnq3{f5| z+c3R~wt%9ab243IgDKtR{jXhRq7FEHu^dNxAWj$ zo1F_&n*_4J>$2@PGV;FrNW*&uo5IqU|fUrHw|a8A4~Xjv5k8>w4u1&=SdlHiwz(04KcU9@B{| zLv{}6+6Pm?wI$=3%p84X1`S;Yr9UblYn)R7PmA}491rnD`M?~SWbSrE^vfCcj9S)PWp&4^h|dlhGnfQl&i=fn(<%0Nsmn}0nz@&cP$r;&Yd zYva5__oI3p%WYFI5*@KrXX@$%%8Smv2GPi4f8@KL2xYE~%f~WSi|KcxwZz`N?!U1! za=Tj5o%hArsNXNE?l-clu8{W$4#!6Q@yhqh-s3&zCiQpP*@k)@jfGu8p(q*ithsW% z)8S=Qms#d`ITO0~kDCQ4u7#$Or@xW5XkM)MX52exV6oqn1TN!tjK$}JK1FL<*UgUF zfy=5~6C?_G|ACwvc3mt6Syj+W6mtMw*w_e+ZS~!=N;w(9z&bAUO1()8a2wC?!w?pR zaJTmALF?Ix7oGZ(mrX&sPW1br%Q%TT%b%Qw@tKCNs)-`fUq9Mq?pV!TplQxFr{wDG zn)M?@QN{8qw*e03eCVgcJJ6Z)I8*uJ%gb}vl;uPE0lIu#l>$?H_P&5nhf#U0Vw81lV6-I&I_k4@<+-kTDb&msP)|e%AsGd~8Atbd39_Bzk%+CpMxQh5pU|g(f7u z92NA^u5Moqjmw6WG*+UMs8Km>Nk3Hcu4LKS8ZoG&umEesIr=pb+QzBv(O&gCm|Sdk8seAhJM z!Fv2JmKJpwExMw!Z^Q~Lb~BqO!p)2yGfXFOi=TDCY3&CT8@SJTa*CFITtjJ;wHJ9(n+y3kx1lT_SVL96QL;r2l}x z`|_Xu1}`iEmAi4BOWK1(=mpLseh~?)UNwJ}CAVUusxlkqm+rsvX74Ga?C;qE3OT1Z zplG~5Mh=mcc>wy_b)BVlsHw!4?fc+kZJW0%lAC{~lj3)L1(f=nYPM4zCrfTb0l8pz zso`Y2GHGz}Fu8ByavJG#eqVq)pqXh&c%yK{F{r9L|)&AzI&^9Q~cAd1g1ns=i}$v z1;Yiy7hwr|%fUP3$a8wwK_*D%rg9tptS45G9nJ8%QfSw5sd8$G4|L5HV-CtXQ?u@6 z0o8x=lp96&{8v~36|Rcd7Mi`8sBJ%0Cm!xg>F!2b>_pbDyc9f?bQWQ?5czx?oB-Y2 z{}2B@G1*|JFqL{y-?Pg#<|*zkB9wjX3MzZz-pGHs!GVW8z=*RdcP3Qn!n?J{lLlIm zCaNG9Z~6L-kpCCUwx`|^cPz26as5S0hpTp#AWJnHSR z&#BZd%QKiFIcj1}B60z|F?O<$L5_)>N68ZOmt>BA(D3vx z9t~2=`6(wREnF38OJKw~)+9*s7aV~q(xp0NR_PYs-D{@GHtg)^A*({YN6?uC5{0G- z30|N6(JYxL-{=5T#URlY^?X1oiFM&(-mB~+c( zXTYq)wOSO#*UM}i2bOS=fTHOHHvJc>Y46_! z`s>x*VFD5^DMaV~pr&fdvGFzZdVUTN%?Qe9?3U-xu;{BISUcmJ_WZtIshLbp%CEpH z69}wP%V&G1?gTbBgUUj~M}K{RtVaAlX(9hT+_#`BYjD*`yj6lZ5${}0UdH^zGg>S4 z{W+~wZbl!791?`kbH!>iCCeGIq3dm0N()Bmp&=FXK_g&sUfQ`y@E_$C&i*14O*;>> zzGcmgOrAZ}re(6SVn;&~9*jRtXYmQq z&ghR|6Rs-~ga3B1mS)~`waVmNxZYV$C@#JiAA{K9r@ws|H5qj9u+6QUMQxC=(S-!J z#hO2Enb;hTI@#)dBnl3|J8>bcqcY!mJT3lO4>`Jp8)!l~GLNQQ6OsnMrY!JEQxM#0 z5aZ_6je{YBn|5F0B;yA|X{{GU5!JkNBDJnR29SD>Fv4En?xGWXkB9Uqp(5s_navZ% z8(*i`Act97AjqyHpk+R8Q$t+KF%&B2LM?fGzHBz8>TIC^Ar{ZNH3^+R@rS4@&p?`h zIKcoTk=cw(Of0YQE#}FwIK7)RCVxL~`yRI@?Zrd)A4tY-ltQTD>a@Yfw*s(QDY(m? 
z{g)g7tZrC=OZ(2%1N{}ys4)5c*6vQ~Y7~>fLL2o&&==)C#XmuC^m<`}u8nHiA<4wv zxco**kChI@V8!L%fEgl(FsF)_t9QyP$n_^q;lk!DKup!{TcF8v=`oK$Iz84wHjK*w z(2uUFJhER8>EKoyxu;6GPZ?M(q{Gezw09EtqP5|d>(L2P*U%Y36wY4?YA%yD-xDeB zk@rt?PadUzeCjImIO&tL`CsvHGK&gBKDNTOm=`Fzwd{~CA(=N1;1r{mem&@%WGBU8 zqGiEc-q~LyeZWP=lM0)j4eh zw{F5DYM88)$h(f-G+lOxFDP>)v)FwKl`>(CF+=6hJr_lY(BJ=awzBr3(>b)DutAhL z6Xxs=@~2+V!MEGFKrlq=>{zA8M$3KYYA~)nqUeMf=d2mL(Mg|0kLz_z8qhC7Qiq5p ze)GGtiPK_Qb+UM!LA0wHkyKF_N_Nt==v&6Oh!EFePH_kZJV)WOuFTL+=41>{zYIpi1Ef$417VqHgtAan6>snF+$POzLS|N6u3@2Zf3f z(#SJ?o;hj_W^IZIRd}t1V zEUrofpaxXR0MgUn)~JHXvQcZ`Y_|RU z^J8J58(Rjaals+8vQko8CL7Jv=%N0CCx{E5HqW2(=$Dx52@>9@;USldt>3W=)E-K3 zofgUy=seurTlXsiUI$M>2$%`?ewsc>44H+CHuiXO!@_G$g98QBBU0o#-D=RIZjtc< zLB|gv^je2NJ9OMm5W*_actsD-XyW!G_cl|tBjszO-%MmrwqJC?iRFGqD)OT6wz=+i z=V!{l;)Vn~fm_Z-BQ~Q(1M{3@)5tCpbSq}u+?MwfwgYp9s6pK;sFK;*;Kpfn4Y0Nf z*a~==gyi95YdLK&`IhcLDu>C)YvZ)Bys~?sIuJjEd z$`%p8LkTxn=Mr5N*o!}u#Z(FhEkfI{Sl8zKeLNGypjq^WnWIPDtf6=B{r^7=_CP`t z9j*~k`TrMU`b=M)iO8T2kdJNstmiY`&gv;LGvDRj;CMYZ{xm?{MQ#Q6eTrlNtWL5-&=p+V zyN&?^Fv&L_92(ei2!`v9wP&T33Db3N$D&r!QQ;pZL&4E&ggx%waEiPlsm$0e1Awo& zMG6x*u`{nOzW4#!|MB2K4VrI@Of_{cwuXo%6fEt$h*$hES5!lsvld32-v}T>;eKrOh3!bdk!s;ElGWs`$Hn79@1Y}oc=KP!-?0> zrUE1w$*Ey5{ z9D1Pe>7NcgD1ZtI#zZNC9jZ|;^x)VyO%cG_z9S)9bqb|Do;W&ez0Jg27_jG zrUIC36NQ&k+mR@gs@g6V#(toL)9akSdg@AyJQNkN=>_x!z`a)~>N{<_IUC(=g#uS- zahI{nJpG5lU*7w-WAWbigZ_Qz6A!O?D6NC+%Dd}-+>fG_u^eW}zI-dHu9auzF1bVM zuixVlbAJ!(&FcGl1;7yH;T4jJyAN;uv&wl5TCrz3G|?rofLQFE)nSi08{>QB&aY!9 zys}fdP99Ca`^V`bGW-hkI4IQ=oQrpxVFqLZnVx^F*@VUuC(Wz#UFVC)A)kq0_V@Vhm1O8_K?bPY zHv3PYaj0_V(m%j~ud(2effN7^B8gBX^P4N^QkI}(FyVQqN#=!vt21hp#;WQ=W+hvs zN|TR2)1W`(V8HAu)7?NfV8Y*C!B?wK3FH*xn+<(m^!C*?AX432LI@5@L3+nfmwhhx zQK7m|05DRZGhr~Rd~6iC>DM~9`c-i4?htRnmOH*i=HrR6$B<5 zW}*A2?TvXgGZ0@(ph3keq=8;}|63z8^|zhZUwF$vVA36iHTXZK7$NA>LxWrR=D2Ht z%axCDgNil~8~4&>$9ZnfMGeR$dR~G0an!u4NXYTQdO$q}xH&+uTgz$V;U|L;%_%@; z92Qi!B)q6FIqfUHaj2LztDR(0E!+5?!?AltK=i=M31cx2y;*XIYz6KbQeelb%l8#p6nAT%%{6I{*gxRb@=e~ccv*D56zJ0 z^L_@V>9Q2(pJi2(6$`iZl_q8f{}gpEQ6uOU)Jao zjBBP+iMDfRu-IkSG0Th3lF@Mjbgqbx2gT)*SPc??r(ahRePBAea?seuS!T>uoI;Ap zimvnj#~y_Q_6lg=Ns?R4W{!uKj12e97UBg1fq0Xin?KClgaj4^u!ndMWS} z-X#UpYtEgVJZY7gIB%V6M3!KGbki4?O#?oy#B8DY`k51Cx1q^~E))94?_$t{o%el< zcJr5jCza;!Kfbvi7JQ+bxGdL3zin8~rV=WL`;)bN@Y3AF0!C2YdbvS`UH+`2{jo-q zfj#4`2IeJcv*?#^rB&Z6WZ+(pMk7Tq^;&av!~WG;MlZh}jrz2!oUd5DXA|^7a>2`l zk79r>bhBW3vrmy;UC%((M0Tfsiqr8#U#f_XHdOT9t&$$ZoZL-XS6|$n;zpFGvZEb* zwvmk@of$M@n)pz~0ozb>V4T8;Ua{~1GmMA(I;g;vz%a$q6rf?1$)U@O!Ysemjyj5l z&AHVxz_~t|6ksr zLG$(Z-U|4vQMty1lmqNlnC1}w9rvnpUw_}s_hYYquxOP1g{22PJNIT~=JCZIw!w_- z5s|pqdd1PH{*}r73x!jz2qRd~JkLhe5hS>){<6UC2>ZtH-wNP>`?o?F^s8JxN?x4& zd(a-Ocu1bQXmNvGfoF)3b=WgqyMxgTxWQ*sbvQKu9|zA{aK}Yh$#soDN>U6QYMi*- zlGCzXH{(~EsSZ5g5ADN4V(Zpr-ggPuNid=-Da=PqJ7ws|Vb9L)v6UP#bbwXuR%p-e z?EP|aQI#V1=!eqQ5I=6nog-F^9khdo7g=7Z1TM>KI^{?vG$x4D%|%_82Hb92NUMGmFA#LC zH3u&f)*|J<>#{D&p(>FdL3p^$3(|5IN~>Ite*{Lw5cx%G%%**fDTMk}EDajztCn#$ zn$wEJ^}_hRYU=Wft2md+r2e$vN@x`Su2lGiNY$W6cbGqvaNUi0;VUm>Ro$f12v+;|&v?K{lv} zF&ReiO;XQpMt4Y%lg0P1W@Q1s_xG#14~~GLzxkgXyyu(r3-^k~P#M9ujB2n4p7FsvCl1%&u2+oXXb1MjXUdHFA;*KW`RGwS5l@@LZ(8- zXSZmvWA>GRqM(!kSP?QaRFqM70H@pKMh(X|@n19BuHkgA!)bXe+*4rkZ=qds^$ROs z;fB(=QOX!lb0;qQw9iFe^j`FOO~t_kJ?j|k5qR8VHDJTy@Z`h6Zkv@;?)+x+Uw`k) zYlj;K;e;@o+MbdJZiaJ3Q?y9+t;>yTA-eq5vfCi3w@unxH?g@DMQ3p``&6XLsvpJv zXjx+}-%f+G!HqeScPL1{a{Lbo9K*2BV5W?j?CNx|3F3an&qNHkf>aQ)lantx;))1u zO_F2J2C)7^PEtbu@1Ndk5Wn0l0XV7LyY=!>`tA@n`K|aw^vv}$NbmFCWmng;3aE8L zvKncQTtT7QO|C=M?04kj^>9SABqR>G2ygs~o*6?~tF+Eq>aSo~A&Xm_;)rUVZN9yP zqs>OUt5$l`U0}q~=Tv>vNOfYEe6zNd*NVZ^deWMNu$d(9ZS}WZjLcvB7DyRnE0CM< 
zT{MD8%^JF*p=&ix>&3rBRfDQX6t;Irnnd9o^et4bcxJb?cuDuk5G7xk?4HYNEvjzI zDydXgz5rhp-~BnTKtAfUTF*fta$2t9;*dH6fzTOu(#?ytZ+>}4kor)o`nRA+a)rsZ z)MTSX(Tc!YytaY5&LQ9~{azKylV*bp9vI%3@ozMe5(ZsN0!3Nick+*i3m5hrN_r=_ z+ed;AjVMPARCCX$aRn59s;P%v;my;q@2Wp|S3aOT&yVebGp&-`Bm7EZF$|Cr^~_s{ z$!}?2j_$d;Nx8pj(f7tmt=R-Ep42XwC{dK)7z?9sS%GuTU!M8t@r*}vE z)$~v6%*OTi9kVYqkN0Nzt`OLunJ}3lF4pcXW8D z$bn7c?*P4}ZStP($Upr9CZVD?=jswrNIxk|fWZll0p8i?&0yw;*tbe75)bYfrB_Ww zp5NBIVR8Erm&lZvbjMxk=vxY>vv`zzxHf+jlz7_LxgBwnelGnha9WiQdX!6><306+ z!L)9egE2C||2C0yQ4C%#YZY1{=@1ho26xS!4RJGsmiL^fHbB@Dqw7#{;~aIi~f zLcL6OubFbiIHBmA$3yCw_HkHj>WvICiQ@F|KOcJM2Gy8kD)yh zz@F)x;m{#n%;Bb%Ez%UdHiSkZY75t=l;!Ti#LCDHGoy1ax^_I-oQ$r6qI!(mryI-*-m#-iL#FkG44%_ zScgx7yB`Hm4r3JJvb~^3xDgfVHs&1vEPvnQ1|%2hkNapE&jd|4$pw}Jo7BUwFNWUF zB!AJbhd+mX(ck#Ufj&_l-qjT8e?IpW#x+u7F;LP04%76wi2SvQw4Z(V#(Z7>{ z_s%$YPscd>zVe>uY>-I;yM`I#)H7;ar$kNcPji1i5gS+pi z>87sQ7{iCRZfq4;>k_KIIttY5rcOeq@^E?Y*-K@UD?SEsFJJH5RqCJ0cdg_o`%0g@ zrR2-L%+!Yv9VCS&ATvSl%Qtrg^u8Q|zp2;zT*OB(aUL@UCQ;)qwf#=Bfym{2W;YEg z8_qu}@e9$;6nkoKral>ZGI4YynL8)0+YO)8V= zK4bXp7QDqqpAGGn8=-J_p>e! z^J)++v?`vCj;A8ego~}J{ll2)0LC24=~Y_Eqi~<>SKY+Fz(RvpZF@hmtjGU@_{|zq z2P~zJAk)M96f0?Qjl()xyhTW#kk$!E;Xmp!0Xo~AF4B1w7>jWDm$N`>#(r>RfJ~#H zhk=kyZw0#yux%W(k!dIK&SeAnD<~LSmNfturk51;oR696UEj3@33w1@|fl+~lJh0-1Gr`%X+}5^UkEli1@Rs3nw-(Q8>0Zg^9*TKv@b6P#smBN&-T3^>Rrv&*aN#Xbo|$Hjo%bpI^6B)%VF^ugyIY zm+z}l^icaNtq9Pv0BDScglOe-@|G=sU^x`Uci<$9NcC{5UewU}q8B~OMuQzRxfiPj zu(!i-5{$p0HSjv@(1p}HK3Dz6F3fj?Y@PCm#Tj(jK7fz$DaqV$IS5NBnI*RW$5yr9 z(L<6M_&4vKHfaAl$CAMgtG`R@jVKiYuTN1>=!)Qd;zT!gdpb-Zn&U05cTzER z?Ay6YzrSau7Yx+@%^Opz_-a=y5CZ}oVY7|>OUq~#%j zbQ|tC_F$TxGc=MRZ`rT zyxCq;eL8O(I2tZ5INq3|yMCONdhyZL*LeC%&x!ob{S70{hrz;iuaw|h$^p_h`p;Ld zJfM6t>ex2L1Ke4W*H3Fk17$KRqB*C*TfJ@v4#QJOH?V(}4*rr+#?)J_16W~qN*N#^ zbsiC;b#Awk0_t>ZOVqQ5YgQaG36Fo)1Sn!R)T*JYRUfW819$(q;pZP~2okpc1X%^& z3sM>DSf}qgrFb!j4qox)om98}xtp+yRQX>Y=>)u-=wmy7vn;RYjkfuR266~Lu{R4k z4ph<{`ZSULP@S>0hTC4!w>Ht+Fuj`+Ol9W5dk_~2G;;>*^3}<5z@xhA!6mm%L_q6e zgFdzcMTc%3fBgtBtiG4fp>vPH95VJMy@D~jsT)Z)Xi^HlvJip&mo2)I%%cDh>I_hq zAhc2=oiU5g#lthl0Nj(p-wq?b&3Ogt>DK;s#q;mm{^xx^tjUROhoB`Z^^dex?KPp} zUX&u;VGi*1ZThlyQ1gG-+S1s2L+<~ub)|q^02exH)fjM#OYV^@{Jl+smPRXoRd=|< zo5W-%L4{$T7Bcu765m+a(*cFd&iOKcV?)@-1cO!`g(eWmbJ##7FZ2K=+Eg*H=z@ZY zeloR}n>Y{&=iJjY_zE&Z_lM$0A_LD2GPi-~74J@#33V52sv19JhS#oA#<-S8Dv1im zv&7IS0XRLIBvVvhK~%}G>qrcql5jUKIOSxH9Jac)s%C38)YUNcx+OWRY?PsB7<3+p zeR}V6s6?t%xyXpct!+K>G4#p?r=Qdzt8&UmmZf+*Fc*0AXuXCX6*RC9e7v@J7Q&1% zpK$SPCu->|!+G3vUs@k~FtHOk(3;h1F(Tdd{f(%zdt7G&3Y`&tSLORgN~`^XWi?9Z?CM zpwzJzQ!uGQ;{V2oEYa<{bp%BvA)uLP`*E>sVJzrfV|wdToo-oGlisq&^;4lV z6?@3H!m$gkvx+CQ->1AC!;{hgyG{y(Sh*HTKWbdnsU4&wrH>C0kmf4ckR}u)XtJv} z&CL7D!6voYls}xb;hm(F}vuxdTMpi zWzr>jXUF$UxO!;c0~l+Zv5Wuz(M8Ppa_v~2z$;4M3hQL8BQav>)qJ=7u?j9Yeir?=;Xf)8_FC=)`v`CpOKX>1~Dcs`$S_2&ev>P17vdc{;q?ek3PvSoRIUg%prkNQpOQ>ws^wN9ROMw!CMUL3#<#KXBj`fL1C5!5 zs;LZz{3kN}wI)tUNPY8?xwpV|9~fD74t_AyI1;NmprV{EAFh94Xg+dRHH6c$l`}ZwkNxC1Jjr1-p6^94tXo^CF`Lf;E1oj$a;G5)4(IQpUEDe~Z1h%2Ve-;?; zaYTn()xJ+)4M!BZat-<-KVzQZ#AC z2sv(nX?9|r$wYD|Q_<+_tF@vR`GmsoagA!yLDb|Mz;6JTZfg%2N`m27%edM|plC=o z*M=Z~W@Rx{BHbJo6z9mcspsjL739HKY_h2%@#9uXwG*}5V&GGtwiBqsc;={U_LnY< zVA#J(RvsKE-o%@%YRKE~qM3e1o2w#MqYQsr*E$YnsPNX^jCzlja2YRFGmM)ql&L+8 zY^^VFVm#P$Lbh`I1?H|Eyu5IpZm~L?*~{|E@xnK*iusSEpIq%;KDH2FGth}G)i|T* z-y6IF_np-oAt`D-xJ-1=y+&*3`B@p_hkexRxqV#7A7HWk`%v1HHKJjLF0)aKhW5J1 zF!m(uY(tHn?OPU}1Ww!C$=>T}b4Hg#%+;9uLBrr`oAgrp%EImKC1d~O(L+^R9GHws zU>3pOmb;1jKpTVWNI}q&Hai&zjd)r$v}fIRH))8vE~3)Uec29m5Pp?l+PY3a`H(u#nCV}pB*R`U-D&*I z$j>ZASi*c~bkFGUp?y6m&Wn>?CT~DRuNvhr*)r zgZNoqY{eMr6UK9UZuH}`C(2h`C-9iRQo3Ktggowf#1qT^Ku!WzmPGGBq 
z22J{8s9o3Z*o^F2;(RiKgI2J$R9g3_lM#dA${xR4)*bReB3>r>g#Yl(ny2u9;g@3Q z0;BNEPK~LR&_KPCRHw!Swye46gH~wen0`^KXb4Q=#fF19pD9wkZFR2okye%plgbXE9=TSVw0v3(&Z^7JO+jMQUzeaRDCOK-ApY7-n8R84 zo5vj5cR5a2;9S)avG#I7Z3iJfeJ&+qU?8tDViQB$(w#1jC1pp=nWHo6s8jKqu9y4= zOJvTzeIL|@P+JcQnTugeS%hWT8eVX287}z7R6%{;=oDFLoVrtF;$e(i%p$`~b)l`k z6~bG3UB_FHdTTmn(Qf;=C9o`0H3$8`G2yzEr1CGaFSZ>`_$&P+0b^t zbHFTLI8KpNLTW=VT>yyMuT0RoPcj;8N6F$qe4w^ErAv+pZ^JsxnIJu z7o=ci+$Dlkqw~N0lG#enc9#sDUJ5ke_Fbi|C-7F8gM1eaE$d4bt*;}UU~Sr2(D1kSGV4s>uzjGpv#V9Vp54UxY7z=bqsU5s(5nNysu5au_C*wc#6zEqw6=O zt+f=$aSlNWj*^%ulS9u*)kF~%l?|LLTJT%npY5kXR4VzyIbQdF$lq6!!=`;M zA#6SINKxKkLidPLsP?<{uoLte_6HgEp2GudlzJx-;=wA;=f-^mHr%gy&Xrclu5$|5 zKz41HD%$_uq#cr!c)To8unZefC`>82JUpxI{%#rem(FwAd5qcMWxHqd5xZ-BoI#Hj zd}ak3ChFgo>g4B!n_dDsX=_0r5Luxm>r@ z5z3bhX|(!s*&=_;q+q!Tjk8_$T)k1BR08qCt#~Tno?|JN?wSTY|9lx2+QHCk0as)$ zrpT18ZF2I9dtyw#d4BK?Hr1;hmAx<14y*z|3L^;jU11S0zDROf=x&qS8 zYAaeRIEwrJND&BgL!t?!bCt&a?0%CG2&(R7|1cT$a-X*!GyYfBm4LIcy?wN`)RNk& zt?jn1)=n2&Nff1Rv}!AAE7cGhQVG!_6jirZRVkqj4O&|jvBkawml9%MgWMvtBn`3D z?wj7{|NXz`x!=q)GiT16^Uj$$XWsYszRx@F**!wRev)^j%|)A+O%lf4srd4^JTM-G zEpJ)ZW)#Qx>NF2KTFb>+2nc)KI1`iS2t zuqC0O)wcbvIVrUSMx>SQY?YgqF;pH8^h+N$_vwZ3Ic{ggkT=TtLr>g@y@MTbxZbMR z!r|Oq&_32tUqk)(lihlw)fNnGZ5p zI}SF)u%--3)Xsj2&CKU#tU>I-?@I{zh5@MI^43>KNYb9>&cMp9Qva~;s=-G@h z;NcFs7w(mcI23+xoa!;igD?0fqbu`jXPI^l(}`*Tqhi=b!4Qv#nZ-#?y7#lxph?BS zQ7i4Q)rN$fqg0Q;)EYZieR`MssytFl`@q@TpOgW_ zW-G~D8!EQ5IGIow*)WDBB&v0=dn!iuDikzhZNZA+HEJRjqTp}RIn%yT!o4~m-%{JU8;W419Cxmyie7Fhb29N*jP;gH~bC5eHN5yAc(-jp0N z`(A$eN`ztAw#1@}<9oqyJ`Cc!=+G@6DOs{e=Yy+w#Tr8J+#P9^gR9_VN4Ai@aXcVDfVqMZrnVDCuwg-kI=%f+qJZIp1eJWune(Ja-f$7#R{R9B=5y ztFk`AhgeI(dcC$GHM*aZNk8r6FO_$}akYlW_SM_#07giou!wh~tKO2M7(19l&0ZiE zzvoH|2FIAi(I2i-R$ix(eNEjg@&+F<-kF(%q8l#$c|RHAt2HJw)nZ0@C{3w#)c2&Z z>uf%Y8JPOv);XFt6p~l`PK;&tqWZ2YVzQet5SCd$SI{54w1L)mbLF7(GKjWHfay)n zx4wy(r^k5Th07PMd6#PO{vMQ$kO$ng6q<{50yavc7z=0xGHFV?8)A7IM`z~P!qJ~z=s2T9eta2~_j{Rl zvTF={lKG?U`n!U;Q?|{yb(4`^%6qMLMClK z$w3T~dHJfcYJj#IUTog}Yb5sx6i*Nssr;h}j(szF{}<4h_Yb&q&u7)>N+|m6VB~Rr z+=68}*(<{)`uI`YsB|}XUH(HQb(*4?jgSIS;YQ5-q3V@_ zYE*58Y@ao*NKbVMs#;V(-K)55M%)eaR~%ZIe=S>K`GQU$hZ|k8si4kyG>Cx;ZQELT z8QaA-L_yLij~(%68V&0zQq8@hU@kCzRxCk+Y3d-Dr%2kRZj{9EYOV*F+i-p?7E68K@ngf}FLsLayj@URlYbo1Ec5Srk~h?k!1aW%TZG~Wj7YVhUkAzC;w zZ(CASJ(X1!H=KIONec+gtUrqEW3Msg%sNs^mwy|(QiTaL`Mx6(Dco}})lvmZ_aOd7 z@fKIFsyd@Ve5DH)$vCc7vZr^HwVCNHlwsY45khP&$^!le)32h7P$iHI+b#?bqPdKj z%_tkjl+QnSC|LAC5PA>Q<#jiKVXI^fOd+KwjOiXys?DG4@^1F>9ZJy7xSXRk|H3nOq|W@bwWqlAhm#`+PL=@PS}|_%2SlzIA7r0+q;hDYt@m*iKp4nvppW4u-fNXLxl>AWnxpsLK$GyF?WbNt`}W)&PGQ#_(AT?adH$F_Lf0q7xDgz0cfW9S3NG%8u_X9AavhdxCo1EPx56GEa>ylHxWdy0+~3 zmbS89DN-J~Xt4zeX`@5dD8@Uhj)tRrX|-MYLr_tW2p+{EeqC35O{h$0gvS2{;kBRe zSQLoB&DMylnU;?0gmb)0z}1^eY56-;CRtB-QY2WV zMX1K2X3N?dK+;Wk8svaUmi=-x)zWER=ERyND{1d?4fygzUyOq4O+J{5A0Mme>od{p zz1u97X~Q1(DzSU-la+&F#16H$>^l$CMi+K-iC?F8V0X=k%n2(HAM<6s0q$C4$}X;m z(*RhaNJH#^Mr{-$=o>g?i6*R8Vw(E7Jm_1)VZcqG)>NLd)TfZVo6YCw7OOGt)Vb6r z+q>{1Valu_MW=0HfhdUqiK>s{>k73yx-S$|-(W9JhKD%_Q#~x_gL9i6xik`{_^4Tb z55{^uMTua^8yn8&oZj1>$dCJxqKc2~3fO5~zZI+;>#IseZ!lw8l;13C{PuI{wExKE zFST+sc-p{i5l%TGI(}6N+F5>v?B*K#lo$q5;bnfV zjq#}%aOj8vzwR`vkO6u8u-`rV;H_6XfR_*or$-dF6M$#~l%M&t&5$oruy*AIgQHAT z$znDGQdw3vJYQw$<}Y93a#16$)xH}{hZqx$mbcxB>(d}{b@{5=x^*Gd&_J$Rc3thj z>jKfIE+#QnsvVpj$~dLz;?Am`KMC@*pUVPlFpwuoMeB{I(Mqh(p985kIps{6_vvf~ z`tebEj(=)S)7ENh#DTn(1vifbmju_Z#q_zQ{XdES>_GoLImUHh{OcqTh7PH9`e*dOaEOJ!z zBTz?(n~Uqj|B?ZYq{!a?PstA!=%L|@M*9DoO8Zc-vFZtKE`R>R|7uSGK=)tm`R`+C Zc%cKWEROL0y6s_Ll>*9&oC0=S{{sp~Xi)$F literal 0 HcmV?d00001 diff --git a/tests/test_acoustic_modeling.py 
index 78943494..51ba97ba 100644
--- a/tests/test_acoustic_modeling.py
+++ b/tests/test_acoustic_modeling.py
@@ -1,4 +1,3 @@
-import os
 import shutil
 import time
 
@@ -36,7 +35,7 @@ def test_basic_mono(
     )
     a.train()
     a.export_model(mono_align_model_path)
-    assert os.path.exists(mono_align_model_path)
+    assert mono_align_model_path.exists()
     del a
     time.sleep(3)
     a = PretrainedAligner(
@@ -47,7 +46,7 @@ def test_basic_mono(
     )
     a.align()
     a.export_files(mono_output_directory)
-    assert os.path.exists(mono_output_directory)
+    assert mono_output_directory.exists()
 
 
 def test_pronunciation_training(
@@ -59,7 +58,7 @@ def test_pronunciation_training(
     groups_path,
     db_setup,
 ):
-    export_path = os.path.join(generated_dir, "pron_train_test_export", "model.zip")
+    export_path = generated_dir.joinpath("pron_train_test_export", "model.zip")
     a = TrainableAligner(
         corpus_directory=basic_corpus_dir,
         dictionary_path=mixed_dict_path,
@@ -77,10 +76,8 @@
     assert rule_query.probability < 1
 
     a.cleanup()
-    assert not os.path.exists(export_path)
-    assert not os.path.exists(
-        os.path.join(generated_dir, "pron_train_test_export", os.path.basename(mixed_dict_path))
-    )
+    assert not export_path.exists()
+    assert not (generated_dir.joinpath("pron_train_test_export", mixed_dict_path.name).exists())
 
     a = TrainableAligner(
         corpus_directory=basic_corpus_dir,
@@ -89,14 +86,10 @@
     )
     a.train()
     a.export_model(export_path)
-    assert os.path.exists(export_path)
-    assert os.path.exists(
-        os.path.join(
-            generated_dir,
-            "pron_train_test_export",
-            os.path.basename(mixed_dict_path).replace(".txt", ".dict"),
-        )
-    )
+    assert export_path.exists()
+    assert generated_dir.joinpath(
+        "pron_train_test_export", mixed_dict_path.with_suffix(".dict").name
+    ).exists()
 
 
 def test_pitch_feature_training(
@@ -131,8 +124,8 @@ def test_basic_lda(basic_dict_path, basic_corpus_dir, lda_train_config_path, db_
 def test_basic_sat(
     basic_dict_path, basic_corpus_dir, generated_dir, sat_train_config_path, db_setup
 ):
-    data_directory = os.path.join(generated_dir, "sat_test")
-    output_model_path = os.path.join(data_directory, "sat_model.zip")
+    data_directory = generated_dir.joinpath("sat_test")
+    output_model_path = data_directory.joinpath("sat_model.zip")
     shutil.rmtree(data_directory, ignore_errors=True)
     a = TrainableAligner(
         **TrainableAligner.parse_parameters(sat_train_config_path),
@@ -144,5 +137,5 @@
     assert len(a.training_configs[a.final_identifier].fmllr_iterations) > 1
 
     a.export_model(output_model_path)
-    assert os.path.exists(output_model_path)
-    assert os.path.exists(os.path.join(a.output_directory, "sat", "trans.1.1.ark"))
+    assert output_model_path.exists()
+    assert a.output_directory.joinpath("sat", "trans.1.1.ark").exists()
diff --git a/tests/test_commandline_adapt.py b/tests/test_commandline_adapt.py
index 9e97ec34..ad940502 100644
--- a/tests/test_commandline_adapt.py
+++ b/tests/test_commandline_adapt.py
@@ -13,7 +13,7 @@ def test_adapt_basic(
     test_align_config,
     english_acoustic_model,
 ):
-    adapted_model_path = os.path.join(generated_dir, "basic_adapted.zip")
+    adapted_model_path = generated_dir.joinpath("basic_adapted.zip")
     command = [
         "adapt",
         basic_corpus_dir,
@@ -23,10 +23,11 @@ def test_adapt_basic(
         "--beam",
         "15",
         "-t",
-        os.path.join(temp_dir, "adapt_cli"),
+        temp_dir.joinpath("adapt_cli"),
         "--clean",
-        "--no-debug",
+        "--no_debug",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -47,8 +48,8 @@ def test_adapt_multilingual(
     english_acoustic_model,
     english_mfa_acoustic_model,
 ):
-    adapted_model_path = os.path.join(generated_dir, "multilingual_adapted.zip")
-    output_path = os.path.join(generated_dir, "multilingual_output")
+    adapted_model_path = generated_dir.joinpath("multilingual_adapted.zip")
+    output_path = generated_dir.joinpath("multilingual_output")
     command = [
         "adapt",
         multilingual_ipa_corpus_dir,
@@ -64,6 +65,7 @@
         "--clean",
         "--no_debug",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
diff --git a/tests/test_commandline_align.py b/tests/test_commandline_align.py
index bcbf6652..db08fa10 100644
--- a/tests/test_commandline_align.py
+++ b/tests/test_commandline_align.py
@@ -25,7 +25,7 @@ def test_align_no_speaker_adaptation(
     temp_dir,
     english_acoustic_model,
 ):
-    output_directory = os.path.join(generated_dir, "basic_output")
+    output_directory = generated_dir.joinpath("basic_output")
     command = [
         "align",
         basic_corpus_dir,
@@ -41,6 +41,7 @@
         "--uses_speaker_adaptation",
         "False",
     ]
+    command = [str(x) for x in command]
     click.testing.CliRunner().invoke(mfa_cli, command, catch_exceptions=False)
     assert os.path.exists(output_directory)
 
@@ -53,7 +54,7 @@ def test_align_single_speaker(
     basic_align_config_path,
     english_acoustic_model,
 ):
-    output_directory = os.path.join(generated_dir, "basic_align_output")
+    output_directory = generated_dir.joinpath("basic_align_output")
     command = [
         "align",
         basic_corpus_dir,
@@ -69,6 +70,7 @@
         "--debug",
         "--single_speaker",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -102,7 +104,7 @@ def test_align_duplicated(
     basic_align_config_path,
     english_acoustic_model,
 ):
-    output_directory = os.path.join(generated_dir, "duplicated_align_output")
+    output_directory = generated_dir.joinpath("duplicated_align_output")
     command = [
         "align",
         duplicated_name_corpus_dir,
@@ -117,6 +119,7 @@
         "--clean",
         "--no_debug",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -153,7 +156,7 @@ def test_align_multilingual(
         multilingual_ipa_corpus_dir,
         english_uk_mfa_dictionary,
         english_mfa_acoustic_model,
-        os.path.join(generated_dir, "multilingual"),
+        generated_dir.joinpath("multilingual"),
         "-t",
         os.path.join(temp_dir, "test_align_multilingual"),
         "--config_path",
@@ -164,6 +167,7 @@
         "--output_format",
         "short_textgrid",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -189,7 +193,7 @@ def test_align_multilingual_speaker_dict(
         multilingual_ipa_corpus_dir,
         mfa_speaker_dict_path,
         english_mfa_acoustic_model,
-        os.path.join(generated_dir, "multilingual_speaker_dict"),
+        generated_dir.joinpath("multilingual_speaker_dict"),
         "-t",
         os.path.join(temp_dir, "test_align_multilingual_speaker_dict"),
         "--config_path",
@@ -200,6 +204,7 @@
         "--output_format",
         "json",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -225,7 +230,7 @@ def test_align_multilingual_tg_speaker_dict(
         multilingual_ipa_tg_corpus_dir,
         mfa_speaker_dict_path,
         english_mfa_acoustic_model,
-        os.path.join(generated_dir, "multilingual_speaker_dict_tg"),
+        generated_dir.joinpath("multilingual_speaker_dict_tg"),
         "-t",
         os.path.join(temp_dir, "test_align_multilingual_tg_speaker_dict"),
         "--config_path",
@@ -235,6 +240,7 @@
         "--debug",
         "--include_original_text",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -262,7 +268,7 @@ def test_align_evaluation(
         basic_corpus_dir,
         english_us_mfa_dictionary,
         english_mfa_acoustic_model,
-        os.path.join(generated_dir, "align_eval_output"),
+        generated_dir.joinpath("align_eval_output"),
         "-t",
         os.path.join(temp_dir, "test_align_evaluation"),
         "--config_path",
@@ -278,6 +284,7 @@
         "--custom_mapping_path",
         eval_mapping_path,
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -304,7 +311,7 @@ def test_align_split(
         text_dir,
         english_us_mfa_dictionary,
         english_mfa_acoustic_model,
-        os.path.join(generated_dir, "multilingual"),
+        generated_dir.joinpath("multilingual"),
         "-t",
         os.path.join(temp_dir, "test_align_split"),
         "--config_path",
@@ -317,6 +324,7 @@
         "--audio_directory",
         audio_dir,
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -336,7 +344,7 @@ def test_align_stereo(
     basic_align_config_path,
     english_acoustic_model,
 ):
-    output_dir = os.path.join(generated_dir, "stereo_output")
+    output_dir = generated_dir.joinpath("stereo_output")
     command = [
         "align",
         stereo_corpus_dir,
@@ -351,6 +359,7 @@
         "--clean",
         "--debug",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -375,7 +384,7 @@ def test_align_mp3s(
     basic_align_config_path,
     english_acoustic_model,
 ):
-    output_dir = os.path.join(generated_dir, "mp3_output")
+    output_dir = generated_dir.joinpath("mp3_output")
     command = [
         "align",
         mp3_corpus_dir,
@@ -390,6 +399,7 @@
         "--clean",
         "--debug",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -414,7 +424,7 @@ def test_align_opus(
     basic_align_config_path,
     english_acoustic_model,
 ):
-    output_dir = os.path.join(generated_dir, "opus_output")
+    output_dir = generated_dir.joinpath("opus_output")
     command = [
         "align",
         opus_corpus_dir,
@@ -429,6 +439,7 @@
         "--clean",
         "--debug",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -453,7 +464,7 @@ def test_swedish_cv(
     basic_align_config_path,
     swedish_cv_acoustic_model,
 ):
-    output_dir = os.path.join(generated_dir, "swedish_cv_output")
+    output_dir = generated_dir.joinpath("swedish_cv_output")
     command = [
         "align",
         swedish_dir,
@@ -468,6 +479,7 @@
         "--clean",
         "--debug",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -500,7 +512,7 @@ def test_swedish_mfa(
     basic_align_config_path,
     swedish_cv_acoustic_model,
 ):
-    output_dir = os.path.join(generated_dir, "swedish_mfa_output")
+    output_dir = generated_dir.joinpath("swedish_mfa_output")
     command = [
         "align",
         swedish_dir,
@@ -515,6 +527,7 @@
         "--clean",
         "--debug",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -549,7 +562,7 @@ def test_acoustic_g2p_model(
 ):
     model_path = os.path.join(acoustic_model_dir, "acoustic_g2p_output_model.zip")
     dict_path = os.path.join(dict_dir, "acoustic_g2p_dictionary.yaml")
-    output_directory = os.path.join(generated_dir, "acoustic_g2p_output")
+    output_directory = generated_dir.joinpath("acoustic_g2p_output")
     command = [
         "align",
         basic_corpus_dir,
@@ -563,6 +576,7 @@
         "--clean",
         "--debug",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
diff --git a/tests/test_commandline_configure.py b/tests/test_commandline_configure.py
index fc5a8c75..6ed11f73 100644
--- a/tests/test_commandline_configure.py
+++ b/tests/test_commandline_configure.py
@@ -30,6 +30,7 @@ def test_configure(
         "--disable_mp",
         "--always_verbose",
     ]
+    command = [str(x) for x in command]
     click.testing.CliRunner().invoke(mfa_cli, command, catch_exceptions=False)
     assert os.path.exists(path)
     global_config.load()
diff --git a/tests/test_commandline_create_segments.py b/tests/test_commandline_create_segments.py
index 65832fff..4c8e1bdf 100644
--- a/tests/test_commandline_create_segments.py
+++ b/tests/test_commandline_create_segments.py
@@ -14,7 +14,7 @@ def test_create_segments(
     temp_dir,
     basic_segment_config_path,
 ):
-    output_path = os.path.join(generated_dir, "segment_output")
+    output_path = generated_dir.joinpath("segment_output")
     shutil.rmtree(output_path, ignore_errors=True)
     command = [
         "segment",
@@ -29,6 +29,7 @@
         "--config_path",
         basic_segment_config_path,
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -49,7 +50,7 @@ def test_create_segments_speechbrain(
 ):
     if not FOUND_SPEECHBRAIN:
         pytest.skip("SpeechBrain not installed")
-    output_path = os.path.join(generated_dir, "segment_output")
+    output_path = generated_dir.joinpath("segment_output")
     command = [
         "segment",
         basic_corpus_dir,
@@ -64,6 +65,7 @@
         "--config_path",
         basic_segment_config_path,
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
diff --git a/tests/test_commandline_diarize_speakers.py b/tests/test_commandline_diarize_speakers.py
index 54603942..466195a5 100644
--- a/tests/test_commandline_diarize_speakers.py
+++ b/tests/test_commandline_diarize_speakers.py
@@ -15,7 +15,7 @@ def test_cluster_mfa(
     transcription_language_model,
     temp_dir,
 ):
-    output_path = os.path.join(generated_dir, "cluster_test_mfa")
+    output_path = generated_dir.joinpath("cluster_test_mfa")
     command = [
         "diarize",
         combined_corpus_dir,
@@ -31,6 +31,7 @@
         "--clean",
         "--evaluate",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -51,7 +52,7 @@ def test_classify_mfa(
     transcription_language_model,
     temp_dir,
 ):
-    output_path = os.path.join(generated_dir, "classify_test_mfa")
+    output_path = generated_dir.joinpath("classify_test_mfa")
     command = [
         "diarize",
         combined_corpus_dir,
@@ -63,6 +64,7 @@
         "--clean",
         "--evaluate",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -84,7 +86,7 @@ def test_cluster_speechbrain(
 ):
     if not FOUND_SPEECHBRAIN:
         pytest.skip("SpeechBrain not installed")
-    output_path = os.path.join(generated_dir, "cluster_test_sb")
+    output_path = generated_dir.joinpath("cluster_test_sb")
     command = [
         "diarize",
         combined_corpus_dir,
@@ -102,6 +104,7 @@
         "--no_debug",
         "--evaluate",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -123,7 +126,7 @@ def test_classify_speechbrain(
 ):
     if not FOUND_SPEECHBRAIN:
         pytest.skip("SpeechBrain not installed")
-    output_path = os.path.join(generated_dir, "classify_test_sb")
+    output_path = generated_dir.joinpath("classify_test_sb")
     command = [
         "diarize",
         combined_corpus_dir,
@@ -136,6 +139,7 @@
         "--no_debug",
         "--evaluate",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
diff --git a/tests/test_commandline_g2p.py b/tests/test_commandline_g2p.py
index 8b8d488f..7e7ea3ff 100644
--- a/tests/test_commandline_g2p.py
+++ b/tests/test_commandline_g2p.py
@@ -10,7 +10,7 @@
 def test_generate_pretrained(
     english_g2p_model, basic_corpus_dir, temp_dir, generated_dir, db_setup
 ):
-    output_path = os.path.join(generated_dir, "g2p_out.txt")
+    output_path = generated_dir.joinpath("g2p_out.txt")
     command = [
         "g2p",
         basic_corpus_dir,
@@ -25,6 +25,7 @@ def test_generate_pretrained(
         "--use_mp",
         "False",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -44,7 +45,7 @@
 def test_generate_pretrained_threshold(
     english_g2p_model, basic_corpus_dir, temp_dir, generated_dir, db_setup
 ):
-    output_path = os.path.join(generated_dir, "g2p_out.txt")
+    output_path = generated_dir.joinpath("g2p_out.txt")
     command = [
         "g2p",
         basic_corpus_dir,
@@ -57,6 +58,7 @@ def test_generate_pretrained_threshold(
         "--g2p_threshold",
         "0.95",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -93,6 +95,7 @@ def test_train_g2p(
         "--config_path",
         train_g2p_config_path,
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -124,6 +127,7 @@ def test_train_g2p_phonetisaurus(
         "--phonetisaurus" "--config_path",
         train_g2p_config_path,
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -157,6 +161,7 @@ def test_generate_dict(
         "--config_path",
         g2p_config_path,
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -194,6 +199,7 @@ def test_generate_dict_phonetisaurus(
         "--config_path",
         g2p_config_path,
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -232,6 +238,7 @@ def test_generate_dict_text_only(
         "--config_path",
         g2p_config_path,
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -256,7 +263,7 @@ def test_generate_dict_textgrid(
     g2p_config_path,
     db_setup,
 ):
-    output_file = os.path.join(generated_dir, "tg_g2pped.dict")
+    output_file = generated_dir.joinpath("tg_g2pped.dict")
     command = [
         "g2p",
         multilingual_ipa_tg_corpus_dir,
@@ -270,6 +277,7 @@ def test_generate_dict_textgrid(
         "--config_path",
         g2p_config_path,
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
diff --git a/tests/test_commandline_lm.py b/tests/test_commandline_lm.py
index 5db78590..c3ef99ae 100644
--- a/tests/test_commandline_lm.py
+++ b/tests/test_commandline_lm.py
@@ -12,7 +12,7 @@ def test_train_lm(
     basic_train_lm_config_path,
 ):
     temp_dir = os.path.join(temp_dir, "train_lm")
-    output_model_path = os.path.join(generated_dir, "test_basic_lm.zip")
+    output_model_path = generated_dir.joinpath("test_basic_lm.zip")
     command = [
         "train_lm",
         basic_corpus_dir,
@@ -24,6 +24,7 @@ def test_train_lm(
         "-q",
         "--clean",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -44,7 +45,7 @@ def test_train_lm_text(
 ):
     temp_dir = os.path.join(temp_dir, "train_lm_text")
     text_dir = basic_split_dir[1]
-    output_model_path = os.path.join(generated_dir, "test_basic_lm_split.zip")
+    output_model_path = generated_dir.joinpath("test_basic_lm_split.zip")
     command = [
         "train_lm",
         text_dir,
@@ -56,6 +57,7 @@ def test_train_lm_text(
         "-q",
         "--clean",
     ]
+    command = [str(x) for x in command]
     click.testing.CliRunner().invoke(mfa_cli, command, catch_exceptions=False)
     assert os.path.exists(output_model_path)
 
@@ -69,7 +71,7 @@ def test_train_lm_dictionary(
 ):
     temp_dir = os.path.join(temp_dir, "train_lm_dictionary")
     text_dir = basic_split_dir[1]
-    output_model_path = os.path.join(generated_dir, "test_basic_lm_split.zip")
+    output_model_path = generated_dir.joinpath("test_basic_lm_split.zip")
     command = [
         "train_lm",
         text_dir,
@@ -83,6 +85,7 @@ def test_train_lm_dictionary(
         "-q",
         "--clean",
     ]
+    command = [str(x) for x in command]
     click.testing.CliRunner().invoke(mfa_cli, command, catch_exceptions=False)
     assert os.path.exists(output_model_path)
 
@@ -94,7 +97,7 @@ def test_train_lm_arpa(
     basic_train_lm_config_path,
 ):
     temp_dir = os.path.join(temp_dir, "train_lm_arpa")
-    output_model_path = os.path.join(generated_dir, "test_basic_lm_split.zip")
+    output_model_path = generated_dir.joinpath("test_basic_lm_split.zip")
     command = [
         "train_lm",
         transcription_language_model_arpa,
@@ -106,6 +109,7 @@ def test_train_lm_arpa(
         "-q",
         "--clean",
     ]
+    command = [str(x) for x in command]
     click.testing.CliRunner().invoke(mfa_cli, command, catch_exceptions=False)
     assert os.path.exists(output_model_path)
 
@@ -117,7 +121,7 @@ def test_train_lm_text_no_mp(
     basic_train_lm_config_path,
 ):
     text_dir = basic_split_dir[1]
-    output_model_path = os.path.join(generated_dir, "test_basic_lm_split.zip")
+    output_model_path = generated_dir.joinpath("test_basic_lm_split.zip")
     command = [
         "train_lm",
         text_dir,
@@ -131,5 +135,6 @@ def test_train_lm_text_no_mp(
         "-j",
         "1",
     ]
+    command = [str(x) for x in command]
     click.testing.CliRunner().invoke(mfa_cli, command, catch_exceptions=False)
     assert os.path.exists(output_model_path)
diff --git a/tests/test_commandline_model.py b/tests/test_commandline_model.py
index 1e8f9f4e..d98e129d 100644
--- a/tests/test_commandline_model.py
+++ b/tests/test_commandline_model.py
@@ -162,7 +162,7 @@ def test_save_model(transcription_acoustic_model):
         "model",
         "save",
         "acoustic",
-        transcription_acoustic_model,
+        str(transcription_acoustic_model),
         "--name",
         "test_acoustic",
         "--overwrite",
diff --git a/tests/test_commandline_tokenize.py b/tests/test_commandline_tokenize.py
new file mode 100644
index 00000000..22e2ec6d
--- /dev/null
+++ b/tests/test_commandline_tokenize.py
@@ -0,0 +1,92 @@
+import os
+
+import click.testing
+import pytest
+
+from montreal_forced_aligner.command_line.mfa import mfa_cli
+
+
+@pytest.mark.skip("No pretrained model yet")
+def test_tokenize_pretrained(japanese_tokenizer_model, japanese_dir, temp_dir, generated_dir):
+    out_directory = generated_dir.joinpath("japanese_tokenized")
+    command = [
+        "tokenize",
+        japanese_dir,
+        japanese_tokenizer_model,
+        out_directory,
+        "-t",
+        os.path.join(temp_dir, "tokenize_cli"),
+        "-q",
+        "--clean",
+        "--use_mp",
+        "False",
+    ]
+    command = [str(x) for x in command]
+    result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
+        mfa_cli, command, catch_exceptions=True
+    )
+    print(result.stdout)
+    print(result.stderr)
+    if result.exception:
+        print(result.exc_info)
+        raise result.exception
+    assert not result.return_value
+    assert os.path.exists(out_directory)
+
+
+def test_train_tokenizer(combined_corpus_dir, temp_dir, generated_dir):
+    output_path = generated_dir.joinpath("test_tokenizer.zip")
+    command = [
+        "train_tokenizer",
+        combined_corpus_dir,
+        output_path,
+        "-t",
+        os.path.join(temp_dir, "test_train_tokenizer"),
+        "-q",
+        "--clean",
+        "--debug",
+        "--validate",
+    ]
+    command = [str(x) for x in command]
+    result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
+        mfa_cli, command, catch_exceptions=True
+    )
+    print(result.stdout)
+    print(result.stderr)
+    if result.exception:
+        print(result.exc_info)
+        raise result.exception
+    assert not result.return_value
+    assert os.path.exists(output_path)
+
+
+def test_tokenize_textgrid(
+    multilingual_ipa_tg_corpus_dir,
+    test_tokenizer_model,
+    generated_dir,
+    temp_dir,
+    g2p_config_path,
+):
+    output_directory = generated_dir.joinpath("tokenized_tg")
+    command = [
+        "tokenize",
+        multilingual_ipa_tg_corpus_dir,
+        test_tokenizer_model,
+        output_directory,
+        "-t",
+        os.path.join(temp_dir, "tokenizer_cli"),
+        "-q",
+        "--clean",
+        "--debug",
+    ]
+    command = [str(x) for x in command]
+    result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
+        mfa_cli, command, catch_exceptions=True
+    )
+    print(result.stdout)
+    print(result.stderr)
+    if result.exception:
+        print(result.exc_info)
+        raise result.exception
+    assert not result.return_value
+    assert os.path.exists(output_directory)
diff --git a/tests/test_commandline_train.py b/tests/test_commandline_train.py
index 35122f94..1c3d13e1 100644
--- a/tests/test_commandline_train.py
+++ b/tests/test_commandline_train.py
@@ -15,7 +15,7 @@ def test_train_acoustic_with_g2p(
 ):
     if os.path.exists(acoustic_g2p_model_path):
         os.remove(acoustic_g2p_model_path)
-    output_directory = os.path.join(generated_dir, "train_g2p_textgrids")
+    output_directory = generated_dir.joinpath("train_g2p_textgrids")
     command = [
         "train",
         combined_corpus_dir,
@@ -32,6 +32,7 @@ def test_train_acoustic_with_g2p(
         "--config_path",
         train_g2p_acoustic_config_path,
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -55,7 +56,7 @@ def test_train_and_align_basic_speaker_dict(
 ):
     if os.path.exists(textgrid_output_model_path):
         os.remove(textgrid_output_model_path)
-    output_directory = os.path.join(generated_dir, "ipa speaker output")
+    output_directory = generated_dir.joinpath("ipa speaker output")
     command = [
         "train",
         multilingual_ipa_tg_corpus_dir,
@@ -72,6 +73,7 @@ def test_train_and_align_basic_speaker_dict(
         output_directory,
         "--single_speaker",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
diff --git a/tests/test_commandline_train_dict.py b/tests/test_commandline_train_dict.py
index d2eb42f4..9c1946c5 100644
--- a/tests/test_commandline_train_dict.py
+++ b/tests/test_commandline_train_dict.py
@@ -13,7 +13,7 @@ def test_train_dict(
     temp_dir,
     basic_align_config_path,
 ):
-    output_path = os.path.join(generated_dir, "trained_dict")
+    output_path = generated_dir.joinpath("trained_dict")
     command = [
         "train_dictionary",
         basic_corpus_dir,
@@ -30,6 +30,7 @@ def test_train_dict(
         basic_align_config_path,
         "--use_mp",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -42,7 +43,7 @@ def test_train_dict(
     dict_path = os.path.join(output_path, "english_us_arpa.dict")
     assert os.path.exists(output_path)
 
-    textgrid_output = os.path.join(generated_dir, "trained_dict_output")
+    textgrid_output = generated_dir.joinpath("trained_dict_output")
     command = [
         "align",
         basic_corpus_dir,
@@ -57,6 +58,7 @@ def test_train_dict(
         "--config_path",
         basic_align_config_path,
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
diff --git a/tests/test_commandline_train_ivector.py b/tests/test_commandline_train_ivector.py
index 966f52d6..d4b927fd 100644
--- a/tests/test_commandline_train_ivector.py
+++ b/tests/test_commandline_train_ivector.py
@@ -24,6 +24,7 @@ def test_basic_ivector(
         "--clean",
         "--debug",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
diff --git a/tests/test_commandline_transcribe.py b/tests/test_commandline_transcribe.py
index 76f72fca..8a6fa759 100644
--- a/tests/test_commandline_transcribe.py
+++ b/tests/test_commandline_transcribe.py
@@ -15,7 +15,7 @@ def test_transcribe(
     temp_dir,
     transcribe_config_path,
 ):
-    output_path = os.path.join(generated_dir, "transcribe_test")
+    output_path = generated_dir.joinpath("transcribe_test")
     command = [
         "transcribe",
         basic_corpus_dir,
@@ -32,6 +32,7 @@ def test_transcribe(
         "--config_path",
         transcribe_config_path,
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -55,7 +56,7 @@ def test_transcribe_arpa(
     transcribe_config_path,
 ):
     temp_dir = os.path.join(temp_dir, "arpa_test_temp")
-    output_path = os.path.join(generated_dir, "transcribe_test_arpa")
+    output_path = generated_dir.joinpath("transcribe_test_arpa")
     command = [
         "transcribe",
         basic_corpus_dir,
@@ -74,6 +75,7 @@ def test_transcribe_arpa(
         "--config_path",
         transcribe_config_path,
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -95,7 +97,7 @@ def test_transcribe_speaker_dictionaries(
     temp_dir,
     transcribe_config_path,
 ):
-    output_path = os.path.join(generated_dir, "transcribe_test")
+    output_path = generated_dir.joinpath("transcribe_test")
     command = [
         "transcribe",
         multilingual_ipa_corpus_dir,
@@ -111,6 +113,7 @@ def test_transcribe_speaker_dictionaries(
         "--config_path",
         transcribe_config_path,
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -133,7 +136,7 @@ def test_transcribe_speaker_dictionaries_evaluate(
     temp_dir,
     transcribe_config_path,
 ):
-    output_path = os.path.join(generated_dir, "transcribe_test")
+    output_path = generated_dir.joinpath("transcribe_test")
     command = [
         "transcribe",
         multilingual_ipa_tg_corpus_dir,
@@ -155,6 +158,7 @@ def test_transcribe_speaker_dictionaries_evaluate(
         transcribe_config_path,
         "--evaluate",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
diff --git a/tests/test_commandline_validate.py b/tests/test_commandline_validate.py
index 5343373d..2fddb192 100644
--- a/tests/test_commandline_validate.py
+++ b/tests/test_commandline_validate.py
@@ -27,6 +27,7 @@ def test_validate_corpus(
         "--test_transcriptions",
         "--phone_confidence",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -59,6 +60,7 @@ def test_validate_training_corpus(
         "--test_transcriptions",
         "--phone_confidence",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -89,6 +91,7 @@ def test_validate_xsampa(
         "--config_path",
         xsampa_train_config_path,
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -116,6 +119,7 @@ def test_validate_dictionary(
         "-j",
         "1",
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
@@ -138,6 +142,7 @@ def test_validate_dictionary_train(
         "-t",
         os.path.join(temp_dir, "dictionary_validation_train"),
     ]
+    command = [str(x) for x in command]
     result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke(
         mfa_cli, command, catch_exceptions=True
     )
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
index 15ee9368..8c7e7d74 100644
--- a/tests/test_corpus.py
+++ b/tests/test_corpus.py
@@ -13,19 +13,19 @@
 
 
 def test_mp3(mp3_test_path):
-    info = get_wav_info(mp3_test_path)
+    info = get_wav_info(str(mp3_test_path))
     assert info.sox_string
     assert info.duration > 0
 
 
 def test_opus(opus_test_path):
-    info = get_wav_info(opus_test_path)
+    info = get_wav_info(str(opus_test_path))
     assert info.sox_string
     assert info.duration > 0
 
 
 def test_add(basic_corpus_dir, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "corpus_tests")
+    output_directory = generated_dir.joinpath("corpus_tests")
     global_config.temporary_directory = output_directory
     corpus = AcousticCorpus(
         corpus_directory=basic_corpus_dir,
@@ -58,7 +58,7 @@ def test_add(basic_corpus_dir, generated_dir, global_config, db_setup):
 
 
 def test_basic_txt(basic_corpus_txt_dir, basic_dict_path, generated_dir, db_setup):
-    output_directory = os.path.join(generated_dir, "corpus_tests")
+    output_directory = generated_dir.joinpath("corpus_tests")
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
     corpus = AcousticCorpus(
@@ -73,7 +73,7 @@ def test_basic_txt(basic_corpus_txt_dir, basic_dict_path, generated_dir, db_setu
 def test_acoustic_from_temp(
     basic_corpus_txt_dir, basic_dict_path, generated_dir, global_config, db_setup
 ):
-    output_directory = os.path.join(generated_dir, "corpus_tests")
+    output_directory = generated_dir.joinpath("corpus_tests")
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
     global_config.temporary_directory = output_directory
@@ -97,7 +97,7 @@
 def test_text_corpus_from_temp(
     basic_corpus_txt_dir, basic_dict_path, generated_dir, global_config, db_setup
 ):
-    output_directory = os.path.join(generated_dir, "corpus_tests")
+    output_directory = generated_dir.joinpath("corpus_tests")
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
     global_config.temporary_directory = output_directory
@@ -117,7 +117,7 @@
 
 
 def test_extra(basic_dict_path, extra_corpus_dir, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "corpus_tests", "extra")
+    output_directory = generated_dir.joinpath("corpus_tests", "extra")
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
     global_config.temporary_directory = output_directory
@@ -133,7 +133,7 @@
 
 
 def test_stereo(basic_dict_path, stereo_corpus_dir, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "corpus_tests", "stereo")
+    output_directory = generated_dir.joinpath("corpus_tests", "stereo")
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
     global_config.temporary_directory = output_directory
@@ -150,7 +150,7 @@
 def test_stereo_short_tg(
     basic_dict_path, stereo_corpus_short_tg_dir, generated_dir, global_config, db_setup
 ):
-    output_directory = os.path.join(generated_dir, "corpus_tests", "stereo_short")
+    output_directory = generated_dir.joinpath("corpus_tests", "stereo_short")
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
     global_config.temporary_directory = output_directory
@@ -166,7 +166,7 @@
 
 def test_audio_directory(basic_dict_path, basic_split_dir, generated_dir, global_config, db_setup):
     audio_dir, text_dir = basic_split_dir
-    output_directory = os.path.join(generated_dir, "corpus_tests", "audio_dir")
+    output_directory = generated_dir.joinpath("corpus_tests", "audio_dir")
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
     global_config.temporary_directory = output_directory
@@ -182,7 +182,7 @@
 
 
 def test_flac(basic_dict_path, flac_corpus_dir, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "corpus_tests", "flac")
+    output_directory = generated_dir.joinpath("corpus_tests", "flac")
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
     global_config.temporary_directory = output_directory
@@ -197,7 +197,7 @@
 
 
 def test_flac_mp(basic_dict_path, flac_corpus_dir, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "corpus_tests", "flac_mp")
+    output_directory = generated_dir.joinpath("corpus_tests", "flac_mp")
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
     global_config.use_mp = True
@@ -213,7 +213,7 @@
 
 
 def test_flac_tg(basic_dict_path, flac_tg_corpus_dir, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "corpus_tests", "flac_no_mp")
+    output_directory = generated_dir.joinpath("corpus_tests", "flac_no_mp")
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
     global_config.temporary_directory = output_directory
@@ -229,7 +229,7 @@
 
 
 def test_flac_tg_mp(basic_dict_path, flac_tg_corpus_dir, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "corpus_tests", "flac_tg_mp")
+    output_directory = generated_dir.joinpath("corpus_tests", "flac_tg_mp")
     global_config.temporary_directory = output_directory
     global_config.use_mp = True
     if os.path.exists(output_directory):
@@ -247,7 +247,7 @@
 def test_24bit_wav(
     transcribe_corpus_24bit_dir, basic_dict_path, generated_dir, global_config, db_setup
 ):
-    output_directory = os.path.join(generated_dir, "corpus_tests", "24bit")
+    output_directory = generated_dir.joinpath("corpus_tests", "24bit")
     global_config.temporary_directory = output_directory
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
@@ -262,7 +262,7 @@
 
 
 def test_short_segments(shortsegments_corpus_dir, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "corpus_tests", "short_segments")
+    output_directory = generated_dir.joinpath("corpus_tests", "short_segments")
     global_config.temporary_directory = output_directory
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
@@ -281,7 +281,7 @@
 def test_speaker_groupings(
     multilingual_ipa_corpus_dir, generated_dir, english_us_mfa_dictionary, global_config, db_setup
 ):
-    output_directory = os.path.join(generated_dir, "corpus_tests", "speaker_groupings")
+    output_directory = generated_dir.joinpath("corpus_tests", "speaker_groupings")
     global_config.temporary_directory = output_directory
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
@@ -322,7 +322,7 @@ def test_speaker_groupings(
 
 
 def test_subset(multilingual_ipa_corpus_dir, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "corpus_tests", "subset")
+    output_directory = generated_dir.joinpath("corpus_tests", "subset")
     global_config.temporary_directory = output_directory
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
@@ -339,7 +339,7 @@ def test_subset(multilingual_ipa_corpus_dir, generated_dir, global_config, db_se
 
 
 def test_weird_words(weird_words_dir, generated_dir, basic_dict_path, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "corpus_tests", "weird_words")
+    output_directory = generated_dir.joinpath("corpus_tests", "weird_words")
     global_config.temporary_directory = output_directory
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
@@ -432,7 +432,7 @@ def test_weird_words(weird_words_dir, generated_dir, basic_dict_path, global_con
 def test_punctuated(
     punctuated_dir, generated_dir, english_us_mfa_dictionary, global_config, db_setup
 ):
-    output_directory = os.path.join(generated_dir, "corpus_tests", "punctuated")
+    output_directory = generated_dir.joinpath("corpus_tests", "punctuated")
     global_config.temporary_directory = output_directory
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
@@ -466,7 +466,7 @@ def test_alternate_punctuation(
 ):
     from montreal_forced_aligner.acoustic_modeling.trainer import TrainableAligner
 
-    output_directory = os.path.join(generated_dir, "corpus_tests", "alternate")
+    output_directory = generated_dir.joinpath("corpus_tests", "alternate")
     global_config.temporary_directory = output_directory
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
@@ -496,7 +496,7 @@ def test_no_punctuation(
 ):
     from montreal_forced_aligner.acoustic_modeling.trainer import TrainableAligner
 
-    output_directory = os.path.join(generated_dir, "corpus_tests", "no_punctuation")
+    output_directory = generated_dir.joinpath("corpus_tests", "no_punctuation")
     global_config.temporary_directory = output_directory
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
@@ -553,7 +553,7 @@ def test_xsampa_corpus(
 ):
     from montreal_forced_aligner.acoustic_modeling.trainer import TrainableAligner
 
-    output_directory = os.path.join(generated_dir, "corpus_tests", "xsampa")
+    output_directory = generated_dir.joinpath("corpus_tests", "xsampa")
     global_config.temporary_directory = output_directory
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
@@ -574,7 +574,7 @@
 
 
 def test_japanese(japanese_dir, japanese_dict_path, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "corpus_tests", "japanese")
+    output_directory = generated_dir.joinpath("corpus_tests", "japanese")
     global_config.temporary_directory = output_directory
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
@@ -592,7 +592,7 @@
 
 
 def test_devanagari(devanagari_dir, hindi_dict_path, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "corpus_tests", "devanagari")
+    output_directory = generated_dir.joinpath("corpus_tests", "devanagari")
     global_config.temporary_directory = output_directory
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
@@ -610,7 +610,7 @@
 def test_french_clitics(
     french_clitics_dir, frclitics_dict_path, generated_dir, global_config, db_setup
 ):
-    output_directory = os.path.join(generated_dir, "corpus_tests", "french_clitics")
+    output_directory = generated_dir.joinpath("corpus_tests", "french_clitics")
     global_config.temporary_directory = output_directory
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory, ignore_errors=True)
diff --git a/tests/test_dict.py b/tests/test_dict.py
index cadbd242..af24a61d 100644
--- a/tests/test_dict.py
+++ b/tests/test_dict.py
@@ -1,4 +1,3 @@
-import os
 import shutil
 
 from montreal_forced_aligner.db import Pronunciation
@@ -6,7 +5,7 @@
 
 
 def test_abstract(abstract_dict_path, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "dictionary_tests", "abstract")
+    output_directory = generated_dir.joinpath("dictionary_tests", "abstract")
     global_config.temporary_directory = output_directory
     shutil.rmtree(output_directory, ignore_errors=True)
     dictionary = MultispeakerDictionary(
@@ -32,7 +31,7 @@ def test_abstract(abstract_dict_path, generated_dir, global_config, db_setup):
 
 
 def test_tabbed(tabbed_dict_path, basic_dict_path, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "dictionary_tests", "tabbed")
+    output_directory = generated_dir.joinpath("dictionary_tests", "tabbed")
     global_config.temporary_directory = output_directory
     shutil.rmtree(output_directory, ignore_errors=True)
     tabbed_dictionary = MultispeakerDictionary(dictionary_path=tabbed_dict_path)
@@ -45,7 +44,7 @@
 
 
 def test_extra_annotations(extra_annotations_path, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "dictionary_tests", "extras")
+    output_directory = generated_dir.joinpath("dictionary_tests", "extras")
     global_config.temporary_directory = output_directory
     shutil.rmtree(output_directory, ignore_errors=True)
     dictionary = MultispeakerDictionary(dictionary_path=extra_annotations_path)
@@ -59,7 +58,7 @@
 
 
 def test_abstract_noposition(abstract_dict_path, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "dictionary_tests", "abstract_no_position")
+    output_directory = generated_dir.joinpath("dictionary_tests", "abstract_no_position")
     global_config.temporary_directory = output_directory
     shutil.rmtree(output_directory, ignore_errors=True)
     dictionary = MultispeakerDictionary(
@@ -72,7 +71,7 @@
 
 
 def test_english_clitics(english_dictionary, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "dictionary_tests", "english_clitics")
+    output_directory = generated_dir.joinpath("dictionary_tests", "english_clitics")
     global_config.temporary_directory = output_directory
     shutil.rmtree(output_directory, ignore_errors=True)
     dictionary = MultispeakerDictionary(
@@ -103,7 +102,7 @@
 
 
 def test_english_mfa(english_us_mfa_dictionary, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir, "dictionary_tests", "english_mfa")
+    output_directory = generated_dir.joinpath("dictionary_tests", "english_mfa")
     global_config.temporary_directory = output_directory
     shutil.rmtree(output_directory, ignore_errors=True)
     dictionary = MultispeakerDictionary(
@@ -125,7 +124,7 @@
 
 
 def test_mandarin_pinyin(pinyin_dictionary, generated_dir, global_config, db_setup):
-    output_directory = os.path.join(generated_dir,
"dictionary_tests", "pinyin") + output_directory = generated_dir.joinpath("dictionary_tests", "pinyin") global_config.temporary_directory = output_directory shutil.rmtree(output_directory, ignore_errors=True) dictionary = MultispeakerDictionary( @@ -163,7 +162,7 @@ def test_mandarin_pinyin(pinyin_dictionary, generated_dir, global_config, db_set def test_multispeaker_config( multispeaker_dictionary_config_path, generated_dir, global_config, db_setup ): - output_directory = os.path.join(generated_dir, "dictionary_tests", "multispeaker") + output_directory = generated_dir.joinpath("dictionary_tests", "multispeaker") global_config.temporary_directory = output_directory shutil.rmtree(output_directory, ignore_errors=True) dictionary = MultispeakerDictionary( @@ -176,7 +175,7 @@ def test_multispeaker_config( def test_mixed_dictionary(mixed_dict_path, generated_dir, global_config, db_setup): - output_directory = os.path.join(generated_dir, "dictionary_tests", "mixed") + output_directory = generated_dir.joinpath("dictionary_tests", "mixed") global_config.temporary_directory = output_directory shutil.rmtree(output_directory, ignore_errors=True) dictionary = MultispeakerDictionary( @@ -214,7 +213,7 @@ def test_mixed_dictionary(mixed_dict_path, generated_dir, global_config, db_setu def test_vietnamese_tones(vietnamese_dict_path, generated_dir, global_config, db_setup): - output_directory = os.path.join(generated_dir, "dictionary_tests", "vietnamese") + output_directory = generated_dir.joinpath("dictionary_tests", "vietnamese") global_config.temporary_directory = output_directory shutil.rmtree(output_directory, ignore_errors=True) dictionary = MultispeakerDictionary( diff --git a/tests/test_gui.py b/tests/test_gui.py index 38a82a89..1a6217ea 100644 --- a/tests/test_gui.py +++ b/tests/test_gui.py @@ -1,10 +1,8 @@ -import os - from montreal_forced_aligner.corpus.acoustic_corpus import AcousticCorpus def test_save_text_lab(basic_corpus_dir, generated_dir, global_config, db_setup): - output_directory = os.path.join(generated_dir, "gui_tests") + output_directory = generated_dir.joinpath("gui_tests") global_config.temporary_directory = output_directory corpus = AcousticCorpus( corpus_directory=basic_corpus_dir, @@ -19,7 +17,7 @@ def test_file_properties( global_config, db_setup, ): - output_directory = os.path.join(generated_dir, "gui_tests") + output_directory = generated_dir.joinpath("gui_tests") global_config.temporary_directory = output_directory corpus = AcousticCorpus( corpus_directory=stereo_corpus_dir, @@ -34,7 +32,7 @@ def test_file_properties( def test_flac_tg(flac_tg_corpus_dir, generated_dir, global_config, db_setup): - output_directory = os.path.join(generated_dir, "gui_tests") + output_directory = generated_dir.joinpath("gui_tests") global_config.temporary_directory = output_directory corpus = AcousticCorpus( corpus_directory=flac_tg_corpus_dir,