Skip to content

Commit

Permalink
2.2.0: Add support for tokenizers (#566)
Browse files Browse the repository at this point in the history
* Add support for tokenizers
  • Loading branch information
mmcauliffe authored Feb 15, 2023
1 parent 5304068 commit e88a65e
Show file tree
Hide file tree
Showing 84 changed files with 2,759 additions and 1,495 deletions.
14 changes: 14 additions & 0 deletions docs/source/reference/tokenization/helper.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Helper functionality
====================

Helper
------

.. currentmodule:: montreal_forced_aligner.tokenization.tokenizer

.. autosummary::
:toctree: generated/

TokenizerRewriter
TokenizerArguments
TokenizerFunction
20 changes: 20 additions & 0 deletions docs/source/reference/tokenization/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@

.. _tokenization_api:

Tokenizers
==========

Tokenizers allow for adding spaces as word boundaries for orthographic systems that don't normally use them (e.g., Japanese, Chinese, Thai).

.. currentmodule:: montreal_forced_aligner.models

.. autosummary::
:toctree: generated/

TokenizerModel

.. toctree::

training
tokenizer
helper
13 changes: 13 additions & 0 deletions docs/source/reference/tokenization/tokenizer.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

.. _tokenizer_api:

Corpus tokenizer
=================

.. currentmodule:: montreal_forced_aligner.tokenization.tokenizer

.. autosummary::
:toctree: generated/

CorpusTokenizer
TokenizerValidator
12 changes: 12 additions & 0 deletions docs/source/reference/tokenization/training.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

.. _tokenizer_model_training_api:

Training tokenizer models
=========================

.. currentmodule:: montreal_forced_aligner.tokenization.trainer

.. autosummary::
:toctree: generated/

TokenizerTrainer -- Trainer for tokenizer models on text corpora
1 change: 1 addition & 0 deletions docs/source/reference/top_level_index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ Workflows
transcription/index
segmentation/index
diarization/index
tokenization/index
2 changes: 2 additions & 0 deletions docs/source/user_guide/corpus_creation/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,6 @@ MFA now contains several command line utilities for helping to create corpora fr
transcribing
training_lm
training_dictionary
tokenize
train_tokenizer
anchor
20 changes: 20 additions & 0 deletions docs/source/user_guide/corpus_creation/tokenize.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@

.. _tokenize_cli:

Tokenize utterances ``(mfa tokenize)``
=========================================

Use a model trained with :ref:`train_tokenizer_cli` to tokenize a corpus (i.e., insert spaces as word boundaries for orthographic systems that do not normally use them).

Command reference
-----------------

.. click:: montreal_forced_aligner.command_line.tokenize:tokenize_cli
:prog: mfa tokenize
:nested: full


API reference
-------------

- :ref:`tokenization_api`
24 changes: 24 additions & 0 deletions docs/source/user_guide/corpus_creation/train_tokenizer.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

.. _train_tokenizer_cli:

Train a word tokenizer ``(mfa train_tokenizer)``
================================================

Training a tokenizer uses a simplified sequence-to-sequence model like G2P, but with the following differences:

* Both the input and output symbols are graphemes
* Each input symbol can only be output as itself
* The only insertions allowed are space characters

Command reference
-----------------

.. click:: montreal_forced_aligner.command_line.train_tokenizer:train_tokenizer_cli
:prog: mfa train_tokenizer
:nested: full


API reference
-------------

- :ref:`tokenization_api`
46 changes: 23 additions & 23 deletions montreal_forced_aligner/abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def data_source_identifier(self) -> str:

@property
@abc.abstractmethod
def output_directory(self) -> str:
def output_directory(self) -> Path:
"""Root temporary directory"""
...

Expand All @@ -153,52 +153,52 @@ def clean_working_directory(self) -> None:
shutil.rmtree(self.output_directory, ignore_errors=True)

@property
def corpus_output_directory(self) -> str:
def corpus_output_directory(self) -> Path:
"""Temporary directory containing all corpus information"""
if self._corpus_output_directory:
return self._corpus_output_directory
return os.path.join(self.output_directory, f"{self.data_source_identifier}")
return self.output_directory.joinpath(f"{self.data_source_identifier}")

@corpus_output_directory.setter
def corpus_output_directory(self, directory: str) -> None:
def corpus_output_directory(self, directory: Path) -> None:
self._corpus_output_directory = directory

@property
def dictionary_output_directory(self) -> str:
def dictionary_output_directory(self) -> Path:
"""Temporary directory containing all dictionary information"""
if self._dictionary_output_directory:
return self._dictionary_output_directory
return os.path.join(self.output_directory, "dictionary")
return self.output_directory.joinpath("dictionary")

@property
def model_output_directory(self) -> str:
def model_output_directory(self) -> Path:
"""Temporary directory containing all model information"""
return os.path.join(self.output_directory, "models")
return self.output_directory.joinpath("models")

@dictionary_output_directory.setter
def dictionary_output_directory(self, directory: str) -> None:
def dictionary_output_directory(self, directory: Path) -> None:
self._dictionary_output_directory = directory

@property
def language_model_output_directory(self) -> str:
def language_model_output_directory(self) -> Path:
"""Temporary directory containing all language model information"""
if self._language_model_output_directory:
return self._language_model_output_directory
return os.path.join(self.model_output_directory, "language_model")
return self.model_output_directory.joinpath("language_model")

@language_model_output_directory.setter
def language_model_output_directory(self, directory: str) -> None:
def language_model_output_directory(self, directory: Path) -> None:
self._language_model_output_directory = directory

@property
def acoustic_model_output_directory(self) -> str:
def acoustic_model_output_directory(self) -> Path:
"""Temporary directory containing all acoustic model information"""
if self._acoustic_model_output_directory:
return self._acoustic_model_output_directory
return os.path.join(self.model_output_directory, "acoustic_model")
return self.model_output_directory.joinpath("acoustic_model")

@acoustic_model_output_directory.setter
def acoustic_model_output_directory(self, directory: str) -> None:
def acoustic_model_output_directory(self, directory: Path) -> None:
self._acoustic_model_output_directory = directory


Expand Down Expand Up @@ -459,18 +459,18 @@ def configuration(self) -> MetaDict:

@property
@abc.abstractmethod
def working_directory(self) -> str:
def working_directory(self) -> Path:
"""Current working directory"""
...

@property
def working_log_directory(self) -> str:
def working_log_directory(self) -> Path:
"""Current working log directory"""
return os.path.join(self.working_directory, "log")
return self.working_directory.joinpath("log")

@property
@abc.abstractmethod
def data_directory(self) -> str:
def data_directory(self) -> Path:
"""Data directory"""
...

Expand Down Expand Up @@ -577,7 +577,7 @@ def parse_args(
@classmethod
def parse_parameters(
cls,
config_path: Optional[str] = None,
config_path: Optional[Path] = None,
args: Optional[Dict[str, Any]] = None,
unknown_args: Optional[typing.Iterable[str]] = None,
) -> MetaDict:
Expand All @@ -586,7 +586,7 @@ def parse_parameters(
Parameters
----------
config_path: str, optional
config_path: :class:`~pathlib.Path`, optional
Path to yaml configuration file
args: dict[str, Any]
Parsed arguments
Expand Down Expand Up @@ -777,13 +777,13 @@ def meta(self) -> MetaDict:
...

@abc.abstractmethod
def export_model(self, output_model_path: str) -> None:
def export_model(self, output_model_path: Path) -> None:
"""
Abstract method to export an MFA model
Parameters
----------
output_model_path: str
output_model_path: :class:`~pathlib.Path`
Path to export model
"""
...
Expand Down
Loading

0 comments on commit e88a65e

Please sign in to comment.